Source code for snipgenie.app

#!/usr/bin/env python

"""
    snipgenie methods for cmd line tool.
    Created Nov 2019
    Copyright (C) Damien Farrell

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License
    as published by the Free Software Foundation; either version 3
    of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""

import sys,os,subprocess,glob,re
import time, datetime
import platform
import urllib.request, hashlib, shutil
import tempfile
import pandas as pd
import numpy as np
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO, AlignIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
#from Bio.Alphabet import generic_dna
from . import tools, aligners, trees
import multiprocessing as mp

tempdir = tempfile.gettempdir()
home = os.path.expanduser("~")
config_path = os.path.join(home,'.config','snipgenie')
module_path = os.path.dirname(os.path.abspath(__file__)) #path to module
datadir = os.path.join(module_path, 'data')
sequence_path = os.path.join(config_path, 'genome')
annotation_path = os.path.join(config_path, 'annotation')
mbovis_genome = os.path.join(sequence_path, 'Mbovis_AF212297.fa')
mtb_genome = os.path.join(sequence_path, 'MTB-H37Rv.fa')
mbovis_gb = os.path.join(datadir, 'Mbovis_AF212297.gb')
mtb_gb = os.path.join(datadir, 'MTB-H37Rv.gb')
map_genome = os.path.join(sequence_path, 'MAP-K10.fa')
map_gb = os.path.join(datadir, 'MAP-K10.gb')
msmeg_genome = os.path.join(sequence_path, 'Msmeg-MC2.fa')
msmeg_gb = os.path.join(datadir, 'Msmeg-MC2.gb')
mbovis_mask =  os.path.join(datadir, 'Mbovis_AF212297_mask.bed')
mtb_mask =  os.path.join(datadir, 'MTB-H37Rv_mask.bed')
sarscov2_genome = os.path.join(sequence_path, 'Sars-Cov-2.fa')
sarscov2_gb = os.path.join(datadir, 'Sars-Cov-2.gb')

preset_genomes = {
           'Mbovis-AF212297':{'sequence':mbovis_genome, 'gb':mbovis_gb, 'mask':mbovis_mask},
           'MTB-H37Rv':{'sequence':mtb_genome, 'gb':mtb_gb, 'mask':mtb_mask},
           'MAP-K10':{'sequence':map_genome, 'gb':map_gb},
           'M.smegmatis-MC2155':{'sequence':msmeg_genome, 'gb':msmeg_gb},
           'Sars-Cov-2':{'sequence':sarscov2_genome, 'gb':sarscov2_gb}
           }

#windows only path to binaries
bin_path = os.path.join(config_path, 'binaries')
#default filtering expression passed to bcftools filter
default_filter = 'QUAL>=40 && FORMAT/DP>=30 && DP4>=4'
annotatestr = '"AD,ADF,ADR,DP,SP,INFO/AD,INFO/ADF,INFO/ADR"'
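#The expression above is applied in variant_calling via `bcftools filter -i`.
#A minimal sketch of overriding it programmatically (the stricter cutoffs here
#are illustrative values, not recommendations); the -f CLI flag is equivalent:
#
#   snps = variant_calling(bam_files, ref, outpath,
#                          filters='QUAL>=60 && FORMAT/DP>=50 && DP4>=4')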

#create the config folder if needed; exist_ok avoids a race with other processes
os.makedirs(config_path, exist_ok=True)

defaults = {'threads':4, 'labelsep':'_', 'labelindex':0,
            'trim':False, 'unmapped':False, 'quality':25,
            'aligner': 'bwa', 'platform': 'illumina', 'species': None,
            'filters': default_filter, 'custom_filters': False, 'mask': None,
            'reference': None, 'gb_file': None, 'overwrite':False,
            'omit_samples': [], 'get_stats':True,
            'buildtree':False, 'bootstraps':100}

def check_platform():
    """See if we are running in Windows"""

    if platform.system() == 'Windows':
        print ('checking binaries are present')
        fetch_binaries()
    return

def copy_ref_genomes():
    """Copy default ref genome files to config dir"""

    files = glob.glob(os.path.join(datadir, '*.fa'))
    path = sequence_path
    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)
    for src in files:
        dest = os.path.join(path, os.path.basename(src))
        shutil.copy(src, dest)
    return

copy_ref_genomes()
def fetch_binaries():
    """Get windows binaries -- windows only"""

    url = "https://github.com/dmnfarrell/snipgenie/raw/master/win_binaries/"
    os.makedirs(bin_path, exist_ok=True)
    names = ['bcftools.exe','bwa.exe','samtools.exe','tabix.exe',
             'subread-align.exe','subread-buildindex.exe','fasttree.exe',
             'makeblastdb.exe','minimap2.exe','rush.exe',
             'msys-2.0.dll','msys-bz2-1.dll','msys-lzma-5.dll',
             'msys-ncursesw6.dll','msys-z.dll']
    for n in names:
        filename = os.path.join(bin_path, n)
        if os.path.exists(filename):
            continue
        print ('fetching %s' %n)
        link = os.path.join(url, n)
        print (filename, link)
        urllib.request.urlretrieve(link, filename)
    return

def get_files_from_paths(paths, ext='*.f*q.gz', filter_list=None):
    """Get files in multiple paths.
    Args:
        ext: wildcard for file types to parse e.g. *.f*q.gz
        filter_list: list of labels that should be present in the filenames, optional
    """

    if not type(paths) == list:
        paths = [paths]
    files = []
    for path in paths:
        if not os.path.exists(path):
            print ('the folder %s does not exist' %path)
        s = glob.glob(os.path.join(path, '**/'+ext), recursive=True)
        files.extend(s)
    if filter_list != None:
        found = []
        for f in files:
            for n in filter_list:
                if n in f:
                    found.append(f)
        files = found
    return files

def get_samples(filenames, sep='-', index=0):
    """Get sample pairs from a list of files, usually fastq. Returns a dataframe
    of unique sample labels for the input and tries to recognise the paired files.
    Args:
        sep: separator to split the name on
        index: placement of label in split list, default 0
    """

    res = []
    cols = ['name','sample','filename']
    for filename in filenames:
        name = os.path.basename(filename)
        name = name.removesuffix('.fastq.gz')
        #make sure we remove pair numbers at the end before getting the sample
        if name[-2:] == '_1' or name[-2:] == '_2':
            label = name[:-2]
        else:
            label = name
        sample = label.split(sep)[index]
        res.append([name, sample, os.path.abspath(filename)])
    df = pd.DataFrame(res, columns=cols)
    df = df.sort_values(['sample','filename'])
    df['pair'] = df.groupby('sample').cumcount()+1
    df = df.drop_duplicates('filename')
    return df

def get_pivoted_samples(df):
    """Pivot samples by pair, returning a table with one sample per row
    and filenames in separate columns."""

    p = pd.pivot_table(df, index='sample', columns='pair',
                       values=['filename','name'], aggfunc='first')
    if len(p.columns) > 4:
        print ('error in filename parsing, check labelsep and labelindex options')
        print (df[:10])
        return
    c = list(zip(p.columns.get_level_values(0), p.columns.get_level_values(1)))
    p.columns = [i[0]+str(i[1]) for i in c]
    p = p.fillna('')
    p = p.reset_index()
    return p

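#A minimal sketch of using the sample parsing helpers standalone, assuming a
#hypothetical folder 'data' of paired files named like sampleA_1.fastq.gz and
#sampleA_2.fastq.gz:
#
#   files = get_files_from_paths(['data'], ext='*.f*q.gz')
#   df = get_samples(files, sep='_', index=0)
#   samples = get_pivoted_samples(df)   #one row per sample: filename1, filename2
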
def check_samples_unique(samples):
    """Check that sample names are unique, returns False if not"""

    x = samples[samples['sample'].duplicated()]
    if len(x) > 0:
        return False

def write_samples(df, path):
    """Write out sample names only, using the dataframe from get_samples"""

    filename = os.path.join(path, 'samples.txt')
    df.to_csv(filename, index=False, header=False)
    return filename

def get_samples_from_bam(filenames, sep='-', index=0):
    """Get samples from bam files"""

    res = []
    cols = ['name','sample','bam_file']
    for filename in filenames:
        name = os.path.basename(filename)
        label = name.removesuffix('.bam').split(sep)[index]
        res.append([name, label, os.path.abspath(filename)])
    df = pd.DataFrame(res, columns=cols)
    df = df.sort_values(['sample','bam_file'])
    df['pair'] = df.groupby('sample').cumcount()+1
    df = df.drop_duplicates('bam_file')
    return df

def check_samples_aligned(samples, outdir):
    """Check how many samples are already aligned"""

    found = glob.glob(os.path.join(outdir, '*.bam'))
    print ('%s/%s samples already aligned' %(len(found), len(samples)))
    return

def mapping_stats(samples):
    """Get stats on mapping of samples"""

    def get_stats(x):
        d = tools.samtools_flagstat(x)
        s = pd.Series(d)
        return s

    for i, r in samples.iterrows():
        s = get_stats(r.bam_file)
        samples.loc[i,'mapped'] = s['mapped']
        total = tools.get_fastq_size(r.filename1)
        #if 'filename2' in samples.columns:
        #    total += tools.get_fastq_size(r.filename1)
        samples.loc[i,'reads'] = total
        samples.loc[i,'perc_mapped'] = round(s['mapped']/total*100, 2)
    return samples

def clean_bam_files(samples, path, remove=False):
    """Check for any bams in the output not present in samples and remove them.
    Not used in the workflow."""

    bams = get_files_from_paths(os.path.abspath(path), '*.bam')
    print ('%s bam files and %s samples found' %(len(bams), len(samples)))
    found = set(bams) - set(samples.bam_file)
    print ('bam files no longer present in samples:')
    print (found)
    if remove == True:
        for f in found:
            print ('removed %s' %f)
            os.remove(f)
    return

def fetch_contam_file():
    """Get contaminant sequences"""

    url = "https://github.com/dmnfarrell/snipgenie/raw/master/extra/contam.fa.gz"
    os.makedirs(bin_path, exist_ok=True)
    tempdir = tempfile.gettempdir()
    filename = os.path.join(tempdir, 'contam.fa.gz')
    destfile = os.path.join(sequence_path, 'contam.fa')
    if os.path.exists(destfile):
        return
    print ('fetching contaminant sequences..')
    print (filename)
    urllib.request.urlretrieve(url, filename)
    tools.gunzip(filename, destfile)
    return

def blast_contaminants(filename, limit=2000, random=False, pident=98, qcovs=90):
    """Blast reads against a contaminants database.
    Returns:
        percentages of reads assigned to each species.
    """

    fetch_contam_file()
    path = os.path.join(sequence_path, 'contam.fa')
    tools.make_blast_database(path)
    if random == True:
        seqs = tools.fastq_random_seqs(filename, limit)
    else:
        seqs = tools.fastq_to_rec(filename, limit)
    bl = tools.blast_sequences(path, seqs, maxseqs=1)
    bl['stitle'] = bl.stitle.apply(lambda x: x.split('__')[0])
    bl = bl[(bl.qcovs>qcovs) & (bl.pident>pident)]
    c = bl.stitle.value_counts()
    c = pd.DataFrame(c)
    c.columns = ['hits']
    c['perc_hits'] = c.hits/limit*100
    c = c[c.hits>5]
    return c

def align_reads(df, idx, outdir='mapped', callback=None, aligner='bwa',
                platform='illumina', unmapped=None, **kwargs):
    """
    Align multiple files. Requires a dataframe with a 'sample' column to
    indicate paired file grouping. If a trimmed column is present those
    files will be aligned instead of the raw ones.
    Args:
        df: dataframe with sample names and filenames
        idx: index name
        outdir: output folder
        unmapped: folder for unmapped reads if required
    """

    if not os.path.exists(outdir):
        os.makedirs(outdir, exist_ok=True)
    if unmapped != None and not os.path.exists(unmapped):
        os.makedirs(unmapped, exist_ok=True)
    samtoolscmd = tools.get_cmd('samtools')
    for i, r in df.iterrows():
        name = r['sample']
        if 'trimmed1' in df.columns:
            print ('using trimmed files..')
            file1 = r.trimmed1
            file2 = r.trimmed2
        else:
            file1 = r.filename1
            if 'filename2' in df.columns:
                file2 = r.filename2
            else:
                file2 = None
        out = os.path.join(outdir, name+'.bam')
        if aligner == 'bwa':
            aligners.bwa_align(file1, file2, idx=idx, out=out, unmapped=unmapped, **kwargs)
        elif aligner == 'bowtie':
            idx = os.path.splitext(os.path.basename(idx))[0]
            aligners.bowtie_align(file1, file2, idx=idx, out=out, **kwargs)
        elif aligner == 'subread':
            idx = os.path.splitext(os.path.basename(idx))[0]
            aligners.subread_align(file1, file2, idx=idx, out=out, **kwargs)
        elif aligner == 'minimap2':
            aligners.minimap2_align(file1, file2, idx=idx, out=out, platform=platform, **kwargs)
        #index the bam if not already done
        bamidx = out+'.bai'
        if not os.path.exists(bamidx) or kwargs.get('overwrite') == True:
            print ('indexing %s' %name)
            cmd = '{s} index {o}'.format(o=out, s=samtoolscmd)
            subprocess.check_output(cmd, shell=True)
            print (cmd)
        df.loc[i,'bam_file'] = os.path.abspath(out)
        #find mean depth/coverage
        if 'meandepth' not in df.columns or pd.isnull(df.loc[i,'meandepth']):
            cols = ['coverage','meandepth']
            c = tools.samtools_coverage(out)
            df.loc[i,cols] = c[cols]
    return df

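#A minimal sketch of calling align_reads directly, assuming `samples` comes
#from get_pivoted_samples and the hypothetical reference 'ref.fa' has already
#been indexed with aligners.build_bwa_index:
#
#   samples = align_reads(samples, idx='ref.fa', outdir='mapped',
#                         aligner='bwa', threads=4, overwrite=False)
#   print (samples[['sample','bam_file','meandepth']])
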
def mpileup(bam_file, ref, out, overwrite=False):
    """Run bcftools mpileup for a single file."""

    bcftoolscmd = tools.get_cmd('bcftools')
    if os.path.exists(out) and overwrite == False:
        return
    cmd = '{bc} mpileup -a {a} -O b --min-MQ 60 -o {o} -f {r} {b}'\
          .format(r=ref, b=bam_file, o=out, bc=bcftoolscmd, a=annotatestr)
    subprocess.check_output(cmd, shell=True)
    cmd = '{bc} index {o}'.format(bc=bcftoolscmd, o=out)
    subprocess.check_output(cmd, shell=True)
    return

def mpileup_region(region, out, bam_files, ref, callback=None):
    """Run bcftools mpileup for a single region."""

    bcftoolscmd = tools.get_cmd('bcftools')
    cmd = '{bc} mpileup -r {reg} -O b -o {o} -f {r} {b}'.format(
          bc=bcftoolscmd, r=ref, reg=region, b=bam_files, o=out)
    if callback != None:
        callback(cmd)
    subprocess.check_output(cmd, shell=True)
    cmd = '{bc} index {o}'.format(bc=bcftoolscmd, o=out)
    subprocess.check_output(cmd, shell=True)
    return

def worker(args):
    """Unpack arguments for mpileup when used with a multiprocessing pool"""
    mpileup(args[0], args[1], args[2])

def mpileup_multiprocess(bam_files, ref, outpath, threads=4, callback=None):
    """Run mpileup in parallel over multiple files and make separate bcfs.
    Assumes alignment to a bacterial reference with a single chromosome."""

    bcftoolscmd = tools.get_cmd('bcftools')
    outfiles = []
    st = time.time()
    bcfpath = os.path.join(outpath, 'bcf')
    if not os.path.exists(bcfpath):
        os.mkdir(bcfpath)
    for bam_file in bam_files:
        name = os.path.splitext(os.path.basename(bam_file))[0]
        out = '{o}/{f}.bcf'.format(o=bcfpath, f=name)
        outfiles.append(out)
    refs = [ref] * len(outfiles)
    data = list(zip(bam_files, refs, outfiles))
    p = mp.Pool(threads)
    p.map_async(worker, data)
    p.close()
    p.join()
    t = time.time()-st
    print ('took %s seconds' %str(round(t, 3)))
    #merge the separate bcfs into one raw file
    rawbcf = os.path.join(outpath, 'raw.bcf')
    bcf_files = ' '.join(outfiles)
    cmd = '{bc} merge --threads {t} -o {r} {b}'.format(b=bcf_files, r=rawbcf,
          bc=bcftoolscmd, t=threads)
    print (cmd)
    subprocess.check_output(cmd, shell=True)
    return rawbcf

def mpileup_parallel(bam_files, ref, outpath, threads=4, callback=None, tempdir=None):
    """Run mpileup over multiple regions with GNU parallel on linux, or rush on
    Windows. The separate bcf files are then joined together.
    Assumes alignment to a bacterial reference with a single chromosome.
    """

    if tempdir == None:
        tempdir = tempfile.gettempdir()
    bam_files = ' '.join(bam_files)
    rawbcf = os.path.join(outpath, 'raw.bcf')
    chr = tools.get_chrom(ref)
    length = tools.get_fasta_length(ref)
    x = np.linspace(1, length, threads+1, dtype=int)
    print (x)
    #split genome into blocks
    blocks = []
    for i in range(len(x)):
        if i < len(x)-1:
            blocks.append((x[i], x[i+1]-1))
    #get temp outfile names
    outfiles = []
    regions = []
    for start, end in blocks:
        region = '"{c}":{s}-{e}'.format(c=chr, s=start, e=end)
        regions.append(region)
        out = os.path.join(tempdir, '{s}-{e}.bcf'.format(s=start, e=end))
        outfiles.append(out)
    regstr = ' '.join(regions)
    filesstr = ' '.join(outfiles)
    bcftoolscmd = tools.get_cmd('bcftools')
    if platform.system() == 'Windows':
        rushcmd = tools.get_cmd('rush')
        cmd = 'echo {reg} | {rc} -D " " "{bc} mpileup -r {{}} -f {r} -a {a} --min-MQ 60 {b} -o {p}/{{@[^:]*$}}.bcf"'\
              .format(rc=rushcmd, bc=bcftoolscmd, reg=regstr, r=ref, b=bam_files, a=annotatestr, p=tempdir)
    else:
        cmd = 'parallel bcftools mpileup -r {{1}} -a {a} -O b --min-MQ 60 -o {{2}} -f {r} {b} ::: {reg} :::+ {o}'\
              .format(r=ref, reg=regstr, b=bam_files, o=filesstr, a=annotatestr)
    print (cmd)
    subprocess.check_output(cmd, shell=True)
    #concat the separate files
    cmd = '{bc} concat {i} -O b -o {o}'.format(bc=bcftoolscmd, i=' '.join(outfiles), o=rawbcf)
    print (cmd)
    subprocess.check_output(cmd, shell=True)
    #remove temp files
    for f in outfiles:
        os.remove(f)
    return rawbcf

def variant_calling(bam_files, ref, outpath, relabel=True, threads=4,
                    callback=None, overwrite=False, filters=None, gff_file=None,
                    mask=None, tempdir=None, custom_filters=False, **kwargs):
    """Call variants with bcftools"""

    st = time.time()
    sample_file = os.path.join(outpath, 'samples.txt')
    if filters == None:
        filters = default_filter
    rawbcf = os.path.join(outpath, 'raw.bcf')
    bcftoolscmd = tools.get_cmd('bcftools')
    if not os.path.exists(rawbcf) or overwrite == True:
        print ('running mpileup..')
        #mpileup is run per-sample in parallel; this replaces the older
        #single-threaded and region-based (mpileup_parallel) methods
        rawbcf = mpileup_multiprocess(bam_files, ref, outpath, threads=threads,
                                      callback=callback)
    else:
        print ('%s already exists' %rawbcf)
        #check samples in the existing file
        rawsamples = tools.get_vcf_samples(rawbcf)
        samples = pd.read_csv(sample_file, names=['name'])
        if len(samples) != len(rawsamples):
            print ('WARNING: samples in raw.bcf appear to be different to current samples.')
            print ('You may have added files since the previous run and will need to overwrite raw.bcf')
    #call variants
    print ('calling variants..')
    vcfout = os.path.join(outpath, 'calls.vcf')
    cmd = '{bc} call --ploidy 1 -m -v -o {o} {raw}'.format(bc=bcftoolscmd, o=vcfout, raw=rawbcf)
    print (cmd)
    subprocess.check_output(cmd, shell=True)
    #relabel samples in vcf header
    if relabel == True:
        relabel_vcfheader(vcfout, sample_file)
    #filters
    filtered = os.path.join(outpath, 'filtered.vcf.gz')
    cmd = '{bc} filter -i "{f}" -o {o} -O z {i}'.format(bc=bcftoolscmd, i=vcfout, o=filtered, f=filters)
    print (cmd)
    tmp = subprocess.check_output(cmd, shell=True)
    if callback != None:
        callback(cmd)
    #get only snps
    print ('splitting snps and indels..')
    snpsout = os.path.join(outpath, 'snps.vcf.gz')
    cmd = '{bc} view -v snps -o {o} -O z {i}'.format(bc=bcftoolscmd, o=snpsout, i=filtered)
    print (cmd)
    subprocess.check_output(cmd, shell=True)
    #also write indels only to a separate file
    indelsout = os.path.join(outpath, 'indels.vcf.gz')
    cmd = '{bc} view -v indels -o {o} -O z {i}'.format(bc=bcftoolscmd, o=indelsout, i=filtered)
    print (cmd)
    subprocess.check_output(cmd, shell=True)
    #apply mask if required
    if mask != None:
        mask_filter(snpsout, mask, outdir=outpath, overwrite=True)
    #custom filters
    if custom_filters == True:
        site_proximity_filter(snpsout, outdir=outpath, overwrite=True)
    #consequence calling
    if gff_file != None:
        print ('consequence calling..')
        try:
            csqout = os.path.join(outpath, 'csq.tsv')
            m = csq_call(ref, gff_file, snpsout, csqout)
            m.to_csv(os.path.join(outpath, 'csq.matrix'))
            #indels as well
            csqout = os.path.join(outpath, 'csq_indels.tsv')
            m = csq_call(ref, gff_file, indelsout, csqout)
            m.to_csv(os.path.join(outpath, 'csq_indels.matrix'))
        except Exception as e:
            print (e)
    print ('took %s seconds' %str(round(time.time()-st, 0)))
    return snpsout

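#A minimal sketch of running variant calling on its own, assuming the bams were
#aligned to the same hypothetical reference 'ref.fa' and a samples.txt has
#already been written to the output folder (needed for relabelling the header):
#
#   snps = variant_calling(bam_files, 'ref.fa', 'results', threads=4,
#                          overwrite=False)
#   df = tools.vcf_to_dataframe(snps)
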
def csq_call(ref, gff_file, vcf_file, csqout):
    """Consequence calling with bcftools csq"""

    bcftoolscmd = tools.get_cmd('bcftools')
    cmd = '{bc} csq -f {r} -g {g} {f} -Ot -o {o}'.format(bc=bcftoolscmd, r=ref,
          g=gff_file, f=vcf_file, o=csqout)
    print (cmd)
    tmp = subprocess.check_output(cmd, shell=True)
    csqdf = read_csq_file(csqout)
    #get presence/absence matrix of csq mutations
    m = get_aa_snp_matrix(csqdf)
    return m

def relabel_vcfheader(vcf_file, sample_file):
    """Re-label samples in the vcf header"""

    bcftoolscmd = tools.get_cmd('bcftools')
    rlout = os.path.join(tempdir, 'calls.vcf')
    cmd = '{bc} reheader --samples {s} -o {o} {v}'.format(bc=bcftoolscmd, o=rlout,
          v=vcf_file, s=sample_file)
    print (cmd)
    tmp = subprocess.check_output(cmd, shell=True)
    #overwrite the input file
    shutil.copy(rlout, vcf_file)
    #remove temp file
    os.remove(rlout)
    return

def mask_filter(vcf_file, mask_file, overwrite=False, outdir=None):
    """Remove any masked sites using a bed file, overwrites the input"""

    print ('using mask bed file', mask_file)
    mask = pd.read_csv(mask_file, sep='\t', names=['chrom','start','end'])

    def do_mask(x, i):
        if (x.start <= i) & (x.end >= i):
            return 1

    import vcf
    vcf_reader = vcf.Reader(open(vcf_file, 'rb'))
    sites = [record.POS for record in vcf_reader]
    print ('%s sites' %len(sites))
    found = []
    for i in sites:
        m = mask.apply(lambda x: do_mask(x, i), 1)
        m = m[m == 1]
        if len(m) > 0:
            found.append(i)
    print ('found %s sites in masked regions' %len(found))
    new = sorted(list(set(sites) - set(found)))
    if outdir == None:
        outdir = tempfile.gettempdir()
    if overwrite == True:
        overwrite_vcf(vcf_file, new, outdir)
    return

def site_proximity_filter(vcf_file, dist=10, overwrite=False, outdir=None):
    """Remove any pairs of sites within dist of each other.
    Args:
        vcf_file: input vcf file with positions to filter
        dist: distance threshold
        overwrite: whether to overwrite the vcf
    """

    #get vcf into a dataframe
    df = tools.vcf_to_dataframe(vcf_file)
    df = df[df.REF != df.ALT]
    sites = list(df.pos.unique())
    found = []
    #check distances between sites per sample
    for s, g in df.groupby(['sample']):
        pos = list(g.pos)
        for i in range(len(pos)-1):
            if pos[i+1] - pos[i] <= dist:
                found.extend([pos[i], pos[i+1]])
    #all unique positions
    found = list(set(found))
    new = sorted(list(set(sites) - set(found)))
    print ('proximity filter removed %s/%s sites' %(len(found), len(sites)))
    if overwrite == True:
        overwrite_vcf(vcf_file, new, outdir)
    return

def overwrite_vcf(vcf_file, sites, outdir=None):
    """Make a new vcf with a subset of sites"""

    if outdir == None:
        outdir = tempfile.gettempdir()
    import vcf
    out = os.path.join(outdir, 'temp.vcf')
    vcf_reader = vcf.Reader(open(vcf_file, 'rb'))
    vcf_writer = vcf.Writer(open(out, 'w'), vcf_reader)
    for record in vcf_reader:
        if record.POS in sites:
            vcf_writer.write_record(record)
    vcf_writer.close()
    #compress and overwrite the input vcf
    bcftoolscmd = tools.get_cmd('bcftools')
    cmd = '{b} view {o} -O z -o {gz}'.format(b=bcftoolscmd, o=out, gz=vcf_file)
    print (cmd)
    tmp = subprocess.check_output(cmd, shell=True)
    return

def trim_files(df, outpath, overwrite=False, threads=4, quality=30):
    """Batch trim fastq files"""

    method = 'cutadapt'
    if platform.system() == 'Windows':
        method = 'default'
    if not os.path.exists(outpath):
        os.makedirs(outpath, exist_ok=True)
    for i, row in df.iterrows():
        out1 = os.path.join(outpath, os.path.basename(row.filename1))
        out2 = os.path.join(outpath, os.path.basename(row.filename2))
        if not os.path.exists(out1) or overwrite == True:
            out1, out2 = tools.trim_reads(row.filename1, row.filename2, outpath,
                                          threads=threads, quality=quality, method=method)
        df.loc[i,'trimmed1'] = out1
        df.loc[i,'trimmed2'] = out2
    return df

def read_csq_file(filename):
    """Read a csq tsv output file into a dataframe"""

    cols = ['1','sample','2','chrom','start','snp_type','gene','locus_tag',
            'strand','feature_type','aa','nuc']
    csqdf = pd.read_csv(filename, sep='[|\t]', comment='#', names=cols, engine='python')
    csqdf['aa'] = csqdf.aa.fillna(csqdf.snp_type)
    csqdf['nuc'] = csqdf.nuc.fillna(csqdf.snp_type)
    return csqdf

def get_aa_snp_matrix(df):
    """Get a presence/absence matrix from the csq calls table"""

    df = df.drop_duplicates(['gene','aa','sample'])
    x = df.set_index(['start','gene','aa','snp_type','sample'])['nuc'].unstack('sample')
    x[x.notna()] = 1
    x = x.fillna(0)
    return x

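#A minimal sketch of going from a csq output file to the presence/absence
#matrix, mirroring what csq_call does internally; 'results/csq.tsv' is a
#hypothetical path from a previous run:
#
#   csqdf = read_csq_file('results/csq.tsv')
#   m = get_aa_snp_matrix(csqdf)   #rows are mutations, columns are samples
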
def run_bamfiles(bam_files, ref, gff_file=None, mask=None, outdir='.', threads=4,
                 sep='_', labelindex=0, samples=None, **kwargs):
    """
    Run the workflow with bam files from previous sets of alignments. Results
    from multiple other runs can be arbitrarily combined this way. kwargs are
    passed to the variant_calling method. A samples.txt file is written to
    outdir so the vcf header can be relabelled.
    Args:
        samples: dataframe of sample names; if not provided, derived from the bam files
    """

    if not os.path.exists(outdir):
        os.makedirs(outdir, exist_ok=True)
    #get sample names from bam files if not provided
    if samples is None:
        samples = get_samples_from_bam(bam_files, sep=sep, index=labelindex)
    write_samples(samples[['sample']], outdir)
    print ('%s samples were loaded:' %len(bam_files))
    vcf_file = variant_calling(bam_files, ref, outdir, threads=threads,
                               relabel=True, gff_file=gff_file, mask=mask, **kwargs)
    snprecs, smat = tools.core_alignment_from_vcf(vcf_file)
    outfasta = os.path.join(outdir, 'core.fa')
    SeqIO.write(snprecs, outfasta, 'fasta')
    smat.to_csv(os.path.join(outdir, 'core.txt'), sep=' ')
    aln = AlignIO.read(outfasta, 'fasta')
    #remove the reference
    aln = aln[1:]
    snp_dist = tools.snp_dist_matrix(aln)
    snp_dist.to_csv(os.path.join(outdir, 'snpdist.csv'), sep=',')
    treefile = trees.run_RAXML(outfasta, outpath=outdir)
    ls = len(smat)
    trees.convert_branch_lengths(treefile, os.path.join(outdir, 'tree.newick'), ls)
    return

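#A minimal sketch of re-using bams from earlier runs, assuming they were all
#aligned to the preset M.bovis reference; 'run1/mapped' and 'run2/mapped' are
#hypothetical output folders:
#
#   bams = get_files_from_paths(['run1/mapped','run2/mapped'], ext='*.bam')
#   run_bamfiles(bams, mbovis_genome, gff_file=None, mask=mbovis_mask,
#                outdir='combined', threads=4)
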
class Logger(object):
    """
    This class duplicates sys.stdout to a log file
    source: https://stackoverflow.com/q/616645
    """
    def __init__(self, filename="run.log", mode="a"):
        self.stdout = sys.stdout
        self.file = open(filename, mode)
        sys.stdout = self

    def __del__(self):
        self.close()

    def __enter__(self):
        pass

    def __exit__(self, *args):
        self.close()

    def write(self, message):
        self.stdout.write(message)
        self.file.write(message)

    def flush(self):
        self.stdout.flush()
        self.file.flush()
        os.fsync(self.file.fileno())

    def close(self):
        if self.stdout != None:
            sys.stdout = self.stdout
            self.stdout = None
        if self.file != None:
            self.file.close()
            self.file = None

#NOTE: this simpler definition shadows the Logger class above; it is the one
#in effect when the module is imported.
class Logger(object):
    """Basic logger that mirrors stdout to a file"""
    def __init__(self, logfile='log.dat'):
        self.terminal = sys.stdout
        self.log = open(logfile, "a")

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        return

class WorkFlow(object):
    """Class for implementing a prediction workflow from a set of options"""

    def __init__(self, **kwargs):
        for i in kwargs:
            self.__dict__[i] = kwargs[i]
        #fill in any missing options from the defaults
        for i in defaults:
            if i not in self.__dict__:
                self.__dict__[i] = defaults[i]
        #make output folder
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir, exist_ok=True)
        #start logger
        self.logfile = os.path.join(self.outdir, 'run.log')
        #sys.stdout = Logger(self.logfile)
        print ('The following options were supplied')
        dt_string = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
        print ("time: ", dt_string)
        print ('-------')
        for i in self.__dict__:
            print (i, ':', self.__dict__[i])
        print ()
        return

    def setup(self):
        """Setup main parameters"""

        if self.species != None:
            s = self.species
            if s not in preset_genomes:
                valid = '; '.join(list(preset_genomes.keys()))
                print ('Invalid species value! Use one of: %s' %valid)
                return
            self.reference = preset_genomes[s]['sequence']
            self.gb_file = preset_genomes[s]['gb']
            if s == 'Mbovis-AF212297':
                self.mask = mbovis_mask
        elif self.reference == None:
            self.reference = mbovis_genome
            self.gb_file = mbovis_gb
        self.filenames = get_files_from_paths(self.input)
        if self.threads == None:
            import multiprocessing
            self.threads = multiprocessing.cpu_count()
        else:
            self.threads = int(self.threads)
        df = get_samples(self.filenames, sep=self.labelsep, index=self.labelindex)
        df = get_pivoted_samples(df)
        if df is None:
            return
        if len(df) == 0:
            print ('no samples provided. files should be fastq.gz type')
            return False
        df['read_length'] = df.filename1.apply(tools.get_fastq_info)
        self.fastq_table = df
        sample_size = len(df['sample'].unique())
        print ('%s samples were loaded:' %sample_size)
        print ('----------------------')
        print (df)
        print ()
        s = check_samples_unique(df)
        if s == False:
            print ('sample names are not unique! try a different labelsep value.')
            return False
        print ('building index')
        if self.aligner == 'bwa':
            aligners.build_bwa_index(self.reference)
        elif self.aligner == 'bowtie':
            aligners.build_bowtie_index(self.reference)
        elif self.aligner == 'subread':
            aligners.build_subread_index(self.reference)
        if self.gb_file != None:
            #convert annotation to gff for consequence calling
            self.gff_file = os.path.join(self.outdir, os.path.basename(self.gb_file)+'.gff')
            tools.gff_bcftools_format(self.gb_file, self.gff_file)
        else:
            self.gff_file = None
        #set temp dir
        self.tempdir = os.path.join(self.outdir, 'tmp')
        if not os.path.exists(self.tempdir):
            os.makedirs(self.tempdir)
        time.sleep(1)
        return True

    def run(self):
        """Run the workflow"""

        #this master table tracks our outputs
        samples = self.fastq_table
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir, exist_ok=True)
        write_samples(samples[['sample']], self.outdir)
        if len(samples) == 0:
            print ('no samples found')
            return
        if self.trim == True:
            print ('trimming fastq files')
            print ('--------------------')
            trimmed_path = os.path.join(self.outdir, 'trimmed')
            samples = trim_files(samples, trimmed_path, self.overwrite,
                                 quality=self.quality, threads=self.threads)
        print ()
        print ('aligning files')
        print ('--------------')
        print ('Using reference genome: %s' %self.reference)
        path = os.path.join(self.outdir, 'mapped')
        if self.unmapped == True:
            unmapped = os.path.join(self.outdir, 'unmapped')
        else:
            unmapped = None
        check_samples_aligned(samples, path)
        samples = align_reads(samples, idx=self.reference, outdir=path,
                              aligner=self.aligner, platform=self.platform,
                              unmapped=unmapped, threads=self.threads,
                              overwrite=self.overwrite)
        lowdepth = samples[samples.meandepth<15]
        if len(lowdepth) > 0:
            print ('%s samples have mean depth <15' %len(lowdepth))
        #mapping stats
        #if 'mapped' not in samples.columns and self.get_stats == True:
        #    samples = mapping_stats(samples)
        #save sample table
        samples.to_csv(os.path.join(self.outdir, 'samples.csv'), index=False)
        print ()
        print ('calling variants')
        print ('----------------')
        bam_files = list(samples.bam_file.unique())
        self.vcf_file = variant_calling(bam_files, self.reference, self.outdir,
                                        threads=self.threads, gff_file=self.gff_file,
                                        filters=self.filters, mask=self.mask,
                                        custom_filters=self.custom_filters,
                                        overwrite=self.overwrite, tempdir=self.tempdir)
        print (self.vcf_file)
        print ()
        print ('making SNP matrix')
        print ('-----------------')
        snprecs, smat = tools.core_alignment_from_vcf(self.vcf_file, omit=self.omit_samples)
        outfasta = os.path.join(self.outdir, 'core.fa')
        SeqIO.write(snprecs, outfasta, 'fasta')
        #write out sites matrix as txt file
        smat.to_csv(os.path.join(self.outdir, 'core.txt'), sep=' ')
        print ()
        #write out pairwise snp distances, removing the reference first
        aln = AlignIO.read(outfasta, 'fasta')
        aln = aln[1:]
        snp_dist = tools.snp_dist_matrix(aln)
        snp_dist.to_csv(os.path.join(self.outdir, 'snpdist.csv'), sep=',')
        print ('Done. Sample summary:')
        print ('---------------------')
        pd.set_option('display.max_rows', 150)
        print (samples.drop(columns=list(samples.filter(regex='filename'))))
        print ()
        if self.buildtree == True:
            print ('building tree')
            print ('-------------')
            if len(bam_files) <= 2:
                print ('Cannot build tree, too few samples.')
                return
            if platform.system() == 'Windows':
                treefile = trees.run_fasttree(outfasta, self.outdir)
            else:
                treefile = trees.run_RAXML(outfasta, threads=self.threads,
                                           bootstraps=self.bootstraps, outpath=self.outdir)
            if treefile == None:
                return
            ls = len(smat)
            trees.convert_branch_lengths(treefile, os.path.join(self.outdir, 'tree.newick'), ls)
            print ()
        return

def test_run():
    """Test run"""

    testdatadir = 'testing'
    out = 'subread_results'
    args = {'threads':4, 'outdir': out, 'input': testdatadir,
            'species': 'Mbovis-AF212297', 'aligner': 'subread',
            'filters': 'QUAL>=40 && DP4>=4',
            'reference': None, 'overwrite': True}
    W = WorkFlow(**args)
    st = W.setup()
    if st == True:
        W.run()
        vdf = tools.vcf_to_dataframe(W.vcf_file)
        print (vdf)
    return

def main():
    """Run the application"""

    from argparse import ArgumentParser
    parser = ArgumentParser(description='snipgenie CLI tool. https://github.com/dmnfarrell/snipgenie')
    parser.add_argument("-i", "--input", action='append', dest="input", default=[],
                        help="input folder(s)", metavar="FILE")
    parser.add_argument("-e", "--labelsep", dest="labelsep", default='_',
                        help="symbol to split the sample labels on")
    parser.add_argument("-x", "--labelindex", dest="labelindex", default=0,
                        help="position to extract label in split filenames")
    parser.add_argument("-r", "--reference", dest="reference", default=None,
                        help="reference genome filename", metavar="FILE")
    parser.add_argument("-S", "--species", dest="species", default=None,
                        help="set the species reference genome, overrides -r")
    parser.add_argument("-g", "--genbank_file", dest="gb_file", default=None,
                        help="annotation file, optional", metavar="FILE")
    parser.add_argument("-t", "--threads", dest="threads", default=4,
                        help="cpu threads to use")
    parser.add_argument("-w", "--overwrite", dest="overwrite", action="store_true", default=False,
                        help="overwrite intermediate files")
    parser.add_argument("-T", "--trim", dest="trim", action="store_true", default=False,
                        help="whether to trim fastq files")
    parser.add_argument("-U", "--unmapped", dest="unmapped", action="store_true", default=False,
                        help="whether to save unmapped reads")
    parser.add_argument("-Q", "--quality", dest="quality", default=25,
                        help="right trim quality, default 25")
    parser.add_argument("-f", "--filters", dest="filters", default=default_filter,
                        help="variant calling post-filters")
    parser.add_argument("-m", "--mask", dest="mask", default=None,
                        help="mask regions from a bed file")
    parser.add_argument("-c", "--custom", dest="custom_filters", action="store_true", default=False,
                        help="apply custom filters")
    parser.add_argument("-p", "--platform", dest="platform", default='illumina',
                        help="sequencing platform, change to ont if using oxford nanopore")
    parser.add_argument("-a", "--aligner", dest="aligner", default='bwa',
                        help="aligner to use: bwa, subread, bowtie or minimap2")
    parser.add_argument("-b", "--buildtree", dest="buildtree", action="store_true", default=False,
                        help="whether to build a phylogenetic tree, requires RAxML")
    parser.add_argument("-N", "--bootstraps", dest="bootstraps", default=100,
                        help="number of bootstraps to build tree")
    parser.add_argument("-o", "--outdir", dest="outdir",
                        help="Results folder", metavar="FILE")
    parser.add_argument("-q", "--qc", dest="qc", action="store_true",
                        help="make a qc report")
    parser.add_argument("-d", "--dummy", dest="dummy", action="store_true", default=False,
                        help="check samples but don't run")
    parser.add_argument("-X", "--test", dest="test", action="store_true", default=False,
                        help="do a test run")
    parser.add_argument("-v", "--version", dest="version", action="store_true",
                        help="print the version")

    args = vars(parser.parse_args())
    check_platform()
    if args['test'] == True:
        test_run()
    elif args['version'] == True:
        from . import __version__
        print ('snipgenie version %s' %__version__)
        print ('https://github.com/dmnfarrell/snipgenie')
    elif args['qc'] == True:
        print ('Running qc report')
        qcfile = 'qc_report.pdf'
        filenames = get_files_from_paths(args['input'])
        tools.pdf_qc_reports(filenames, qcfile)
    elif args['outdir'] == None:
        print ('No input or output folders provided. These are required.')
        print ('Example:')
        print ('snipgenie -r <reference> -i <input folder with fastq.gz files> -o <output folder>')
        print ('Use -h for more help on options.')
    else:
        W = WorkFlow(**args)
        st = W.setup()
        if st == True and args['dummy'] == False:
            W.run()

if __name__ == '__main__':
    main()