Source code for cap2.pipeline.short_read.humann2


import luigi
import subprocess
from os.path import join, dirname, basename

from ..utils.cap_task import CapTask
from ..config import PipelineConfig
from ..utils.conda import CondaPackage, PyPiPackage
from ..databases.uniref import Uniref90, HumannIdTable
from ..preprocessing.clean_reads import CleanReads


[docs]class MicaUniref90(CapTask): module_description = """ This module aligns reads to UniRef90 in preparation for functional profiling. Motivation: UniRef90 is a large database of functional genes. Negatives: Fairly slow. Many of the genes in UniRef90 are merely predicted functions. Note: this module currently uses Diamond not MiCA """ MODULE_VERSION = 'v0.2.0' def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.pkg = CondaPackage( package="diamond", executable="diamond", channel="bioconda", config_filename=self.config_filename, ) self.config = PipelineConfig(self.config_filename) self.out_dir = self.config.out_dir self.db = Uniref90(config_filename=self.config_filename) self.reads = CleanReads.from_cap_task(self) def tool_version(self): return self.run_cmd(f'{self.pkg.bin} --version').stderr.decode('utf-8') def requires(self): return self.pkg, self.db, self.reads @classmethod def _module_name(cls): return 'diamond' @classmethod def dependencies(cls): return ['humann==3.0.0a4', 'diamond==0.9.32', Uniref90, CleanReads] def output(self): return { 'm8': self.get_target('uniref90', 'm8.gz'), } def _run(self): cmd = ( f'{self.pkg.bin} blastx ' f'--threads {self.cores} ' f'-d {self.db.diamond_index} ' f'-q {self.reads.output()["clean_reads_1"].path} ' '--block-size 6 ' f'| gzip > {self.output()["m8"].path} ' ) self.run_cmd(cmd)
[docs]class Humann2(CapTask): module_description = """ This module provides functional profiles of microbiomes. Motivation: Functional profiles are sometimes more stable than taxonomic profiles and can provided metabolic and adaptive insights. Negatives: Functional profiling is somewhat less well benchmarked than taxonomic profiling and metabolic pathways are often based on model organisms. Notes: This class is named Humann2 for historical reasons. It uses humann3 """ MODULE_VERSION = 'v0.3.0' def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.config = PipelineConfig(self.config_filename) self.humann = PyPiPackage( package="humann==3.0.0a4", executable="humann", config_filename=self.config_filename, ) self.db = HumannIdTable.from_cap_task(self) self.out_dir = self.config.out_dir self.alignment = MicaUniref90.from_cap_task(self) def tool_version(self): return self.run_cmd(f'{self.pkg.bin} --version').stderr.decode('utf-8') def requires(self): return self.humann, self.alignment, self.db @classmethod def _module_name(cls): return 'humann' @classmethod def dependencies(cls): return ['humann==3.0.0a4', MicaUniref90, HumannIdTable] def output(self): return { 'genes': self.get_target('genes', 'tsv'), 'path_abunds': self.get_target('path_abunds', 'tsv'), 'path_covs': self.get_target('path_covs', 'tsv'), } def _run(self): odir = self.sample_name + '_humann2' genes = odir + '/*genefamilies.tsv' abunds = odir + '/*pathabundance.tsv' covs = odir + '/*pathcoverage.tsv' cmd = ( f'{self.humann.bin} ' f'--id-mapping {self.db.humann_id_table} ' f'--input {self.alignment.output()["m8"].path} ' f'--output {self.sample_name}_humann2 ; ' 'mv ' + genes + ' ' + self.output()['genes'].path + '; ' 'mv ' + abunds + ' ' + self.output()['path_abunds'].path + '; ' 'mv ' + covs + ' ' + self.output()['path_covs'].path + '; ' f'rm -r {self.sample_name}_humann2;' ) self.run_cmd(cmd)