Source code for cap2.pipeline.databases.mouse_removal_db
import luigi
from os.path import join, dirname
from glob import glob
import subprocess
from os import makedirs
from ..config import PipelineConfig
from ..utils.conda import CondaPackage
from ..utils.cap_task import CapDbTask
MOUSE_GENOME_URL = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/635/GCA_000001635.9_GRCm39/GCA_000001635.9_GRCm39_genomic.fna.gz'
[docs]class MouseRemovalDB(CapDbTask):
"""This class is responsible for building and/or retriveing
validating the database which will be used to remove mouse
reads from the sample.
"""
config_filename = luigi.Parameter()
cores = luigi.IntParameter(default=1)
MODULE_VERSION = 'v1.0.0'
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.pkg = CondaPackage(
package="bowtie2==2.4.1",
executable="bowtie2-build",
channel="bioconda",
config_filename=self.config_filename,
)
self.config = PipelineConfig(self.config_filename)
self.db_dir = self.config.db_dir
self._fastas = []
def download_mouse_genome(self):
local_dir = join(self.config.db_dir, 'GRCm39')
makedirs(local_dir, exist_ok=True)
cmd = (
'wget '
f'--directory-prefix={local_dir} '
f'{MOUSE_GENOME_URL} '
)
self.run_cmd(cmd)
local_path = join(local_dir, 'GCA_000001635.9_GRCm39_genomic.fna.gz')
return local_path
@property
def fastas(self):
if self._fastas:
return self._fastas
local_path = self.download_mouse_genome()
self._fastas = [local_path]
return self._fastas
def tool_version(self):
return self.run_cmd(f'{self.pkg.bin} --version').stderr.decode('utf-8')
def requires(self):
return self.pkg
@classmethod
def _module_name(cls):
return 'bowtie_mouse_removal_db'
@classmethod
def dependencies(cls):
return ['bowtie2==2.4.1', 'GRCm39', '2020-10-21']
@property
def bowtie2_index(self):
return join(self.db_dir, 'GRCm39', 'mouse_removal.bt2')
def output(self):
index = luigi.LocalTarget(self.bowtie2_index + '.1.bt2')
index.makedirs()
return {
'bt2_index_1': index,
}
def build_bowtie2_index_from_fasta(self):
cmd = ''.join((
self.pkg.bin,
f' --threads {self.cores} ',
','.join(self.fastas),
' ',
self.bowtie2_index
))
subprocess.check_call(cmd, shell=True)
def download_bowtie2_index_from_s3(self):
paths = [
'cap2/databases/2020-06-08/hg38/hg38.fa.gz',
'cap2/databases/2020-06-08/hg38/human_removal.bt2.1.bt2',
'cap2/databases/2020-06-08/hg38/human_removal.bt2.2.bt2',
'cap2/databases/2020-06-08/hg38/human_removal.bt2.3.bt2',
'cap2/databases/2020-06-08/hg38/human_removal.bt2.4.bt2',
'cap2/databases/2020-06-08/hg38/human_removal.bt2.rev.1.bt2',
'cap2/databases/2020-06-08/hg38/human_removal.bt2.rev.2.bt2',
]
for path in paths:
cmd = (
'wget '
f'--directory-prefix={dirname(self.output()["bt2_index_1"].path)} '
f'https://s3.wasabisys.com/metasub-microbiome/{path} '
)
self.run_cmd(cmd)
def run(self):
self.build_bowtie2_index_from_fasta()