#!/usr/bin/env python3
"""
Purpose
-------
This module is intended execute Spades on paired-end FastQ files.
Expected input
--------------
The following variables are expected whether using NextFlow or the
:py:func:`main` executor.
- ``sample_id`` : Sample Identification string.
- e.g.: ``'SampleA'``
- ``fastq_pair`` : Pair of FastQ file paths.
- e.g.: ``'SampleA_1.fastq.gz SampleA_2.fastq.gz'``
- ``kmers`` : Setting for Spades kmers. Can be either ``'auto'``, \
``'default'`` or a user provided list.
- e.g.: ``'auto'`` or ``'default'`` or ``'55 77 99 113 127'``
- ``opts`` : List of options for spades execution.
1. The minimum number of reads to consider an edge in the de Bruijn \
graph during the assembly.
- e.g.: ``'5'``
2. Minimum contigs k-mer coverage.
- e.g.: ``['2' '2']``
- ``clear`` : If 'true', remove the input fastq files at the end of the
component run, IF THE FILES ARE IN THE WORK DIRECTORY
Generated output
----------------
- ``contigs.fasta`` : Main output of spades with the assembly
- e.g.: ``contigs.fasta``
- ``spades_status`` : Stores the status of the spades run. If it was \
successfully executed, it stores ``'pass'``. Otherwise, it stores the\
``STDERR`` message.
- e.g.: ``'pass'``
Code documentation
------------------
"""
__version__ = "1.0.2"
__build__ = "29062018"
__template__ = "spades-nf"
import os
import sys
import re
import subprocess
from subprocess import PIPE
from flowcraft_utils.flowcraft_base import get_logger, MainWrapper
logger = get_logger(__file__)
def __get_version_spades():
try:
cli = ["spades.py", "--version"]
p = subprocess.Popen(cli, stdout=PIPE, stderr=PIPE)
stdout, _ = p.communicate()
version = stdout.strip().split()[-1][1:].decode("utf8")
except Exception as e:
logger.debug(e)
version = "undefined"
return {
"program": "SPAdes",
"version": version,
}
if __file__.endswith(".command.sh"):
SAMPLE_ID = '$sample_id'
FASTQ_PAIR = '$fastq_pair'.split()
MAX_LEN = int('$max_len'.strip())
KMERS = '$kmers'.strip()
CLEAR = '$clear'
DISABLE_RR = '$disable_rr'
OPTS = [x.strip() for x in '$opts'.strip("[]").split(",")]
CLEAR = '$clear'
logger.debug("Running {} with parameters:".format(
os.path.basename(__file__)))
logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
logger.debug("FASTQ_PAIR: {}".format(FASTQ_PAIR))
logger.debug("MAX_LEN: {}".format(MAX_LEN))
logger.debug("KMERS: {}".format(KMERS))
logger.debug("OPTS: {}".format(OPTS))
logger.debug("CLEAR: {}".format(CLEAR))
logger.debug("DISABLE_RR: {}".format(DISABLE_RR))
[docs]def set_kmers(kmer_opt, max_read_len):
"""Returns a kmer list based on the provided kmer option and max read len.
Parameters
----------
kmer_opt : str
The k-mer option. Can be either ``'auto'``, ``'default'`` or a
sequence of space separated integers, ``'23, 45, 67'``.
max_read_len : int
The maximum read length of the current sample.
Returns
-------
kmers : list
List of k-mer values that will be provided to Spades.
"""
logger.debug("Kmer option set to: {}".format(kmer_opt))
# Check if kmer option is set to auto
if kmer_opt == "auto":
if max_read_len >= 175:
kmers = [55, 77, 99, 113, 127]
else:
kmers = [21, 33, 55, 67, 77]
logger.debug("Kmer range automatically selected based on max read"
"length of {}: {}".format(max_read_len, kmers))
# Check if manual kmers were specified
elif len(kmer_opt.split()) > 1:
kmers = kmer_opt.split()
logger.debug("Kmer range manually set to: {}".format(kmers))
else:
kmers = []
logger.debug("Kmer range set to empty (will be automatically "
"determined by SPAdes")
return kmers
[docs]def clean_up(fastq):
"""
Cleans the temporary fastq files. If they are symlinks, the link
source is removed
Parameters
----------
fastq : list
List of fastq files.
"""
for fq in fastq:
# Get real path of fastq files, following symlinks
rp = os.path.realpath(fq)
logger.debug("Removing temporary fastq file path: {}".format(rp))
if re.match(".*/work/.{2}/.{30}/.*", rp):
os.remove(rp)
@MainWrapper
def main(sample_id, fastq_pair, max_len, kmer, opts, clear, disable_rr):
"""Main executor of the spades template.
Parameters
----------
sample_id : str
Sample Identification string.
fastq_pair : list
Two element list containing the paired FastQ files.
max_len : int
Maximum read length. This value is determined in
:py:class:`templates.integrity_coverage`
kmer : str
Can be either ``'auto'``, ``'default'`` or a
sequence of space separated integers, ``'23, 45, 67'``.
opts : List of options for spades execution. See above.
clear : str
Can be either 'true' or 'false'. If 'true', the input fastq files will
be removed at the end of the run, IF they are in the working directory
disable_rr : str
Can either be 'true' or 'false'. If 'true', disables repeat resolution
stage of assembling
"""
logger.info("Starting spades")
min_coverage, min_kmer_coverage = opts
logger.info("Setting SPAdes kmers")
kmers = set_kmers(kmer, max_len)
logger.info("SPAdes kmers set to: {}".format(kmers))
cli = [
"spades.py",
"--careful",
"--only-assembler",
"--threads",
"$task.cpus",
"--cov-cutoff",
min_coverage,
"-o",
"."
]
# Add kmers, if any were specified
if kmers:
cli += ["-k {}".format(",".join([str(x) for x in kmers]))]
# Add FastQ files
cli += [
"-1",
fastq_pair[0],
"-2",
fastq_pair[1]
]
# Disable RR?
if disable_rr == 'true':
cli += ['--disable-rr']
logger.debug("Running SPAdes subprocess with command: {}".format(cli))
p = subprocess.Popen(cli, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
# Attempt to decode STDERR output from bytes. If unsuccessful, coerce to
# string
try:
stderr = stderr.decode("utf8")
stdout = stdout.decode("utf8")
except (UnicodeDecodeError, AttributeError):
stderr = str(stderr)
stdout = str(stdout)
logger.info("Finished SPAdes subprocess with STDOUT:\\n"
"======================================\\n{}".format(stdout))
logger.info("Fished SPAdes subprocess with STDERR:\\n"
"======================================\\n{}".format(stderr))
logger.info("Finished SPAdes with return code: {}".format(
p.returncode))
with open(".status", "w") as fh:
if p.returncode != 0:
fh.write("error")
sys.exit(p.returncode)
else:
fh.write("pass")
# Change the default contigs.fasta assembly name to a more informative one
if "_trim." in fastq_pair[0]:
sample_id += "_trim"
# Get spades version for output name
info = __get_version_spades()
assembly_file = "{}_spades{}.fasta".format(
sample_id, info["version"].replace(".", ""))
os.rename("contigs.fasta", assembly_file)
logger.info("Setting main assembly file to: {}".format(assembly_file))
# Remove input fastq files when clear option is specified.
# Only remove temporary input when the expected output exists.
if clear == "true" and os.path.exists(assembly_file):
clean_up(fastq_pair)
if __name__ == '__main__':
main(SAMPLE_ID, FASTQ_PAIR, MAX_LEN, KMERS, OPTS, CLEAR, DISABLE_RR)