Source code for cap2.pipeline.short_read.jellyfish


import luigi
import subprocess
import os
from os.path import join, dirname, basename, isfile

from ..utils.cap_task import CapTask
from ..constants import MASH_SKETCH_SIZE
from ..config import PipelineConfig
from ..utils.conda import CondaPackage
from ..preprocessing.clean_reads import CleanReads


[docs]class Jellyfish(CapTask): RAM = '1G' Ks = [31, 15] module_description = """ This module counts kmer within samples. Motivation: K-mer counts frequencies provide a database-free way to analyze samples and are the basis of many other analyses. Negatives: K-mer counts can be large and are sensitive to read errors. The latter is mitigated somewhat by read error correction. """ MODULE_VERSION = 'v0.2.0' def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.pkg = CondaPackage( package="jellyfish", executable="jellyfish", channel="bioconda", config_filename=self.config_filename, ) self.config = PipelineConfig(self.config_filename) self.out_dir = self.config.out_dir self.reads = CleanReads.from_cap_task(self) def tool_version(self): return self.run_cmd(f'{self.pkg.bin} --version').stderr.decode('utf-8') @classmethod def _module_name(cls): return 'jellyfish' def requires(self): return self.pkg, self.reads @classmethod def dependencies(cls): return ['jellyfish', CleanReads] def output(self): return { f'k{K}': self.get_target(f'k{K}', 'jf') for K in self.Ks } def _cmd(self, k): outfile = self.output()[f'k{k}'].path r1 = self.reads.output()['clean_reads_1'].path r2 = self.reads.output()['clean_reads_2'].path cmd = ( f'{self.pkg.bin} count ' f'-m {k} ' f'-s {self.RAM} ' '-t 8 ' '-C ' f'-o {outfile} ' f'<(gunzip -c {r1}) ' f'<(gunzip -c {r2}) ' ) return cmd def _run(self): for K in self.Ks: self.run_cmd(self._cmd(K)) path = self.output()[f'k{K}'].path if isfile(path + '_0'): os.rename(path + '_0', path)