Source code for cap2.pipeline.preprocessing.remove_adapters


import luigi
import subprocess
from os.path import join, dirname, basename

from ..utils.cap_task import CapTask
from ..config import PipelineConfig
from ..utils.conda import CondaPackage
from ..databases.human_removal_db import HumanRemovalDB
from .base_reads import BaseReads


class AdapterRemoval(CapTask):
    """Pipeline task that trims adapters and low-quality bases from reads.

    The task shells out to the ``AdapterRemoval`` executable (installed via
    the ``adapterremoval`` bioconda package) and writes gzip-compressed,
    trimmed FASTQ files plus the tool's settings report. Single-end and
    paired-end samples are both handled; the choice is driven by
    ``self.paired``.

    NOTE(review): ``self.paired``, ``self.cores``, ``self.sample_name``,
    ``self.config_filename``, ``get_target`` and ``run_cmd`` are not defined
    in this class; they are presumably provided by ``CapTask`` -- confirm.
    """

    # NOTE(review): "wuality" below is a typo for "quality". The text is kept
    # verbatim because it is a runtime string that other code may display.
    module_description = """
    This module removes adapter sequences and low wuality sequences.

    Motivation: adapter sequences can be misidentified or lead to issues
    with assembly or k-mer profiles. Removing adapters is fast and reduces
    this issue.

    Negatives: adapter sequences may not always be properly identified.
    """
    # Adapter sequence used for both mates (see ``__init__``).
    ILLUMINA_SHARED_PREFIX = 'AGATCGGAAGAGC'
    MODULE_VERSION = 'v0.2.1'

    def __init__(self, *args, **kwargs):
        """Set up the conda package, the upstream reads task and adapters."""
        super().__init__(*args, **kwargs)
        self.pkg = CondaPackage(
            package="adapterremoval",
            executable="AdapterRemoval",
            channel="bioconda",
            config_filename=self.config_filename,
        )
        self.reads = BaseReads.from_cap_task(self)
        self.config = PipelineConfig(self.config_filename)
        self.out_dir = self.config.out_dir
        # Both mates are trimmed with the same adapter sequence.
        self.adapter1 = self.ILLUMINA_SHARED_PREFIX
        self.adapter2 = self.ILLUMINA_SHARED_PREFIX

    def requires(self):
        """Return the upstream tasks: the conda package and the raw reads."""
        return self.pkg, self.reads

    def tool_version(self):
        """Return the text the tool writes to stderr for ``--version``."""
        return self.run_cmd(f'{self.pkg.bin} --version').stderr.decode('utf-8')

    @classmethod
    def dependencies(cls):
        """Return the dependencies of this module (package name and task)."""
        return ["adapterremoval", BaseReads]

    @classmethod
    def _module_name(cls):
        """Return the short identifier of this module."""
        return 'adapter_removal'

    def get_run_metadata(self):
        """Return the parent's run metadata extended with the adapters used."""
        blob = super().get_run_metadata()
        blob['adapter_sequences'] = {
            'adapter1': self.adapter1,
            'adapter2': self.adapter2,
        }
        return blob

    def output(self):
        """Return the output targets keyed by name.

        ``adapter_removed_reads_2`` is only present for paired samples.
        """
        out = {
            'adapter_removed_reads_1': self.get_target('adapter_removed', 'R1.fastq.gz'),
            'settings': self.get_target('settings', 'txt'),
        }
        if self.paired:
            out['adapter_removed_reads_2'] = self.get_target('adapter_removed', 'R2.fastq.gz')
        return out

    def _run(self):
        """Dispatch to the paired-end or single-end implementation."""
        if self.paired:
            return self._run_paired()
        return self._run_single()

    def _build_cmd(self, paired):
        """Assemble the shell command line for one AdapterRemoval run.

        Args:
            paired: when true, add the mate-2 input, adapter and output
                options; otherwise build the single-end command.

        Returns:
            The shell command string: the AdapterRemoval invocation followed
            by an ``rm`` of the temporary files sharing the basename prefix.
        """
        read_targets = self.reads.output()
        out_targets = self.output()
        tmp_prefix = f'ar_temp_{self.sample_name}'

        parts = [self.pkg.bin, f'--file1 {read_targets["base_reads_1"].path}']
        if paired:
            parts.append(f'--file2 {read_targets["base_reads_2"].path}')
        parts += [
            '--trimns',
            '--trimqualities',
            '--gzip',
            f'--adapter1 {self.adapter1}',
        ]
        if paired:
            # The mate-2 adapter must go to --adapter2; passing --adapter1 a
            # second time would leave the mate-2 adapter unset by this task.
            parts.append(f'--adapter2 {self.adapter2}')
        parts.append(f'--output1 {out_targets["adapter_removed_reads_1"].path}')
        if paired:
            parts.append(f'--output2 {out_targets["adapter_removed_reads_2"].path}')
        parts += [
            f'--settings {out_targets["settings"].path}',
            f'--basename {tmp_prefix}',
            '--minquality 2',
            f'--threads {self.cores}',
        ]
        # NOTE(review): ';' means the cleanup runs even if the tool fails and
        # the shell's exit status is that of ``rm`` -- kept as in the original.
        return ' '.join(parts) + ' ; ' + f'rm {tmp_prefix}*'

    def _run_single(self):
        """Trim a single-end sample."""
        self.run_cmd(self._build_cmd(paired=False))

    def _run_paired(self):
        """Trim a paired-end sample, applying one adapter to each mate."""
        self.run_cmd(self._build_cmd(paired=True))