# Source code for cap2.pipeline.assembly.metaspades
import shlex
import shutil
from os.path import join, dirname, basename, isdir
from shutil import rmtree

import luigi

from ..utils.cap_task import CapTask
from ..config import PipelineConfig
from ..utils.conda import CondaPackage
from ..preprocessing import CleanReads
class MetaspadesAssembly(CapTask):
    """Assemble a sample's cleaned paired-end reads into contigs with metaSPAdes.

    The task depends on the conda-provided ``spades`` package and on the
    ``CleanReads`` task for the same sample. It runs ``metaspades.py`` into a
    scratch directory, moves the files of interest to the task's output
    targets and then deletes the scratch directory.
    """

    module_description = """
    This module assembles reads into contigs.
    Motivation: Assembly can help to find large order genetic structures.
    Negatives: assembly is consistently being refined to be more efficient
    and effective. Misassemblies are possible.
    """

    # Value passed to the assembler's ``-m`` option (memory limit, in Gb per
    # the SPAdes manual). Subclasses may override it.
    MEMORY_LIMIT_GB = 200

    def __init__(self, *args, **kwargs):
        """Build the conda package and upstream read-cleaning task, and pick the executable."""
        super().__init__(*args, **kwargs)
        self.pkg = CondaPackage(
            package="spades",
            executable="metaspades.py",
            channel="bioconda",
            config_filename=self.config_filename
        )
        self.reads = CleanReads(
            sample_name=self.sample_name,
            pe1=self.pe1,
            pe2=self.pe2,
            config_filename=self.config_filename
        )
        self.config = PipelineConfig(self.config_filename)
        # Default to the conda-installed binary; an explicit path in the
        # pipeline config takes precedence.
        self.exc = self.pkg.bin
        if self.config.exc_metaspades is not None:
            self.exc = self.config.exc_metaspades

    def tool_version(self):
        """Return the text printed on stderr by ``<executable> --version``.

        The executable queried is ``self.exc`` -- the same one ``_run``
        invokes -- so the reported version matches the tool actually used
        even when the config overrides the conda binary.
        """
        # NOTE(review): only stderr is read; confirm the installed SPAdes
        # release writes its version banner there rather than to stdout.
        return self.run_cmd(f'{self.exc} --version').stderr.decode('utf-8')

    @classmethod
    def _module_name(cls):
        """Return the short name of this module."""
        return 'metaspades'

    def requires(self):
        """Return the tasks that must complete first: the package and the clean reads."""
        return self.pkg, self.reads

    @classmethod
    def version(cls):
        """Return the version string of this module."""
        return 'v0.3.0'

    @classmethod
    def dependencies(cls):
        """Return the dependency list: the ``spades`` package name and ``CleanReads``."""
        return ['spades', CleanReads]

    def output(self):
        """Return a dict mapping output names to their targets."""
        return {
            'contigs': self.get_target('contigs', 'fasta'),
            'contig_paths': self.get_target('contigs', 'paths'),
            'scaffolds_fasta': self.get_target('scaffolds', 'fasta'),
            'scaffolds_paths': self.get_target('scaffolds', 'paths'),
            'fastg': self.get_target('graph', 'fastg'),
        }

    def _run(self):
        """Run the assembler and move its results onto the output targets.

        A scratch directory under ``self.out_dir`` receives the raw assembler
        output. Any leftover directory from an earlier attempt is removed
        first; after the result files are moved the scratch directory is
        deleted.
        """
        out_dir = f'{self.out_dir}/tmp_metaspades_out.{self.sample_name}'
        if isdir(out_dir):
            rmtree(out_dir)
        reads = self.reads.output()
        # Paths are shell-quoted so that whitespace or metacharacters in a
        # directory or sample name cannot split or alter the command.
        # self.exc is left as-is because it is a configured command.
        cmd = ''.join((
            self.exc,
            ' --only-assembler ',  # we start from error corrected reads
            ' -1 ', shlex.quote(reads["clean_reads_1"].path),
            ' -2 ', shlex.quote(reads["clean_reads_2"].path),
            f' -t {self.cores} ',
            f' -m {self.MEMORY_LIMIT_GB} ',
            f' -o {shlex.quote(out_dir)}'
        ))
        self.run_cmd(cmd)
        # (file name inside the scratch directory, key in self.output())
        pairs_to_move = [
            ('contigs.fasta', 'contigs'),
            ('scaffolds.fasta', 'scaffolds_fasta'),
            ('scaffolds.paths', 'scaffolds_paths'),
            ('assembly_graph.fastg', 'fastg'),
            ('contigs.paths', 'contig_paths'),
        ]
        out = self.output()
        for produced, key in pairs_to_move:
            src = shlex.quote(f'{out_dir}/{produced}')
            dest = shlex.quote(out[key].path)
            self.run_cmd(f'mv {src} {dest}')
        rmtree(out_dir)