Source code for cap2.pipeline.short_read.mash
import luigi
import subprocess
from os.path import join, dirname, basename
from ..utils.cap_task import CapTask
from ..constants import MASH_SKETCH_SIZE
from ..config import PipelineConfig
from ..utils.conda import CondaPackage
from ..preprocessing.clean_reads import CleanReads
class Mash(CapTask):
    """Pipeline task that sketches a sample's cleaned reads with MASH.

    Running the task invokes ``mash sketch`` twice on the first cleaned
    read file, once with a sketch size of 10,000 and once with a sketch
    size of 10,000,000, and exposes the two results as output targets.
    """

    # Human-readable module summary. The text is deliberately flush-left so
    # the string value stays exactly as it is.
    module_description = """
This module provides small sketches of samples.
Motivation: MASH sketches provide an efficient way to compute the
distance between microbiome samples. Since MASH sketches are not
based on any database they aren't biased towards certain sample
types.
Negatives: Small MASH sketch sizes can obscure differences between
samples. As such this module produces two different sketch sizes.
"""
    MODULE_VERSION = 'v0.2.0'

    def __init__(self, *args, **kwargs):
        """Set up the MASH conda package, pipeline config and upstream reads.

        All positional and keyword arguments are forwarded unchanged to
        the parent ``CapTask`` constructor.
        """
        super().__init__(*args, **kwargs)
        config_path = self.config_filename
        self.pkg = CondaPackage(
            package="mash==2.2.2",
            executable="mash",
            channel="bioconda",
            config_filename=config_path,
        )
        pipeline_config = PipelineConfig(config_path)
        self.config = pipeline_config
        self.out_dir = pipeline_config.out_dir
        # Upstream task whose ``clean_reads_1`` output is sketched.
        self.reads = CleanReads.from_cap_task(self)

    def tool_version(self):
        """Return the stderr of ``<mash> --version`` decoded as UTF-8."""
        version_cmd = f'{self.pkg.bin} --version'
        completed = self.run_cmd(version_cmd)
        return completed.stderr.decode('utf-8')

    @classmethod
    def _module_name(cls):
        """Return the short name identifying this module."""
        return 'mash'

    def requires(self):
        """Return the tasks this one needs: the MASH package and the reads."""
        return (self.pkg, self.reads)

    @classmethod
    def dependencies(cls):
        """Return the class-level dependencies: the pinned tool and CleanReads."""
        return ['mash==2.2.2', CleanReads]

    def output(self):
        """Return the two sketch targets keyed by sketch-size label."""
        large_sketch = self.get_target('10M_sketch', 'msh')
        small_sketch = self.get_target('10K_sketch', 'msh')
        return {
            '10M_mash_sketch': large_sketch,
            '10K_mash_sketch': small_sketch,
        }

    def _cmd(self, mash_sketch_size, out_key):
        """Build the ``mash sketch`` command line for one sketch size.

        Args:
            mash_sketch_size: value passed to the ``-s`` option.
            out_key: key into ``self.output()`` selecting the target file.

        Returns:
            The command as a single space-separated string.
        """
        mash_bin = self.pkg.bin
        sketch_path = self.output()[out_key].path
        # Drop the last four characters of the target path before handing it
        # to ``-o``. NOTE(review): presumably this strips a trailing ".msh"
        # because mash appends the extension itself -- confirm.
        out_prefix = sketch_path[:-4]
        reads_path = self.reads.output()["clean_reads_1"].path
        return (
            f'{mash_bin} sketch -s {mash_sketch_size} '
            f'-o {out_prefix} {reads_path}'
        )

    def _run(self):
        """Run ``mash sketch`` for the 10K sketch, then for the 10M sketch."""
        sketch_jobs = (
            (10 * 1000, '10K_mash_sketch'),
            (10 * 1000 * 1000, '10M_mash_sketch'),
        )
        for sketch_size, target_key in sketch_jobs:
            self.run_cmd(self._cmd(sketch_size, target_key))