Source code for cap2.pipeline.preprocessing.map_to_human


import luigi
import subprocess
from os.path import join, dirname, basename

from .map_to_mouse import RemoveMouseReads
from ..utils.cap_task import CapTask
from ..config import PipelineConfig
from ..utils.conda import CondaPackage
from ..databases.human_removal_db import HumanRemovalDB


[docs]class RemoveHumanReads(CapTask): module_description = """ This module removes reads likely to be human. Motivation: nearly every metagenomic sample can contain human DNA some of which resembles microbial sequences. Removing likely human DNA decreases the chance of incorrectly mis-identifying human DNA as microbial. Negatives: in some samples the sequences that resemble human DNA may actually be microbial though in most cases part of the microbial genome will not resemble human. """ MODULE_VERSION = 'v0.2.1' def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.pkg = CondaPackage( package="bowtie2", executable="bowtie2", channel="bioconda", config_filename=self.config_filename, ) self.samtools = CondaPackage( package="samtools=1.09", executable="samtools", channel="bioconda", config_filename=self.config_filename, ) self.config = PipelineConfig(self.config_filename) self.out_dir = self.config.out_dir self.db = HumanRemovalDB(config_filename=self.config_filename) self.mouse_removed_reads = RemoveMouseReads.from_cap_task(self) def requires(self): return self.samtools, self.pkg, self.db, self.mouse_removed_reads def tool_version(self): version = '[BOWTIE2]\n' version += self.run_cmd(f'{self.pkg.bin} --version').stderr.decode('utf-8') version += '\n[SAMTOOLS]\n' version += self.run_cmd(f'{self.samtools.bin} --version').stderr.decode('utf-8') return version @classmethod def dependencies(cls): return ["samtools", "bowtie2", HumanRemovalDB, RemoveMouseReads] @classmethod def _module_name(cls): return 'remove_human' def output(self): out = { 'bam': self.get_target('human_alignment', 'bam'), 'nonhuman_reads_1': self.get_target('nonhuman_reads', 'R1.fastq.gz'), } if self.paired: out['nonhuman_reads_2'] = self.get_target('nonhuman_reads', 'R2.fastq.gz') return out def _run(self): if self.paired: return self._run_paired() return self._run_single() def _run_single(self): fastq_out = self.output()['nonhuman_reads_1'].path.replace('R1', 'R%') cmd = ''.join(( self.pkg.bin, ' -x ', self.db.bowtie2_index, ' -U ', self.mouse_removed_reads.output()['nonmouse_reads_1'].path, f' --un-gz ', self.output()['nonhuman_reads_1'].path, ' --threads ', str(self.cores), ' --very-sensitive ', f' | {self.samtools.bin} view -F 4 -b > ', self.output()['bam'].path, )) self.run_cmd(cmd) def _run_paired(self): fastq_out = self.output()['nonhuman_reads_1'].path.replace('R1', 'R%') cmd = ''.join(( self.pkg.bin, ' -x ', self.db.bowtie2_index, ' -1 ', self.mouse_removed_reads.output()['nonmouse_reads_1'].path, ' -2 ', self.mouse_removed_reads.output()['nonmouse_reads_2'].path, f' --un-conc-gz {fastq_out} ', ' --threads ', str(self.cores), ' --very-sensitive ', f' | {self.samtools.bin} view -F 4 -b > ', self.output()['bam'].path, )) self.run_cmd(cmd)