#!/usr/bin/env python
"""
snipgenie methods for cmd line tool.
Created Nov 2019
Copyright (C) Damien Farrell
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 3
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warroanty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
import sys,os,subprocess,glob,re
import time, datetime
import platform
import urllib, hashlib, shutil
import tempfile
import pandas as pd
import numpy as np
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO, AlignIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
#from Bio.Alphabet import generic_dna
from . import tools, aligners, trees
import multiprocessing as mp
tempdir = tempfile.gettempdir()
home = os.path.expanduser("~")
config_path = os.path.join(home,'.config','snipgenie')
module_path = os.path.dirname(os.path.abspath(__file__)) #path to module
datadir = os.path.join(module_path, 'data')
sequence_path = os.path.join(config_path, 'genome')
annotation_path = os.path.join(config_path, 'annotation')
mbovis_genome = os.path.join(sequence_path, 'Mbovis_AF212297.fa')
mtb_genome = os.path.join(sequence_path, 'MTB-H37Rv.fa')
mbovis_gb = os.path.join(datadir, 'Mbovis_AF212297.gb')
mtb_gb = os.path.join(datadir, 'MTB-H37Rv.gb')
map_genome = os.path.join(sequence_path, 'MAP-K10.fa')
map_gb = os.path.join(datadir, 'MAP-K10.gb')
msmeg_genome = os.path.join(sequence_path, 'Msmeg-MC2.fa')
msmeg_gb = os.path.join(datadir, 'Msmeg-MC2.gb')
mbovis_mask = os.path.join(datadir, 'Mbovis_AF212297_mask.bed')
mtb_mask = os.path.join(datadir, 'MTB-H37Rv_mask.bed')
sarscov2_genome = os.path.join(sequence_path, 'Sars-Cov-2.fa')
sarscov2_gb = os.path.join(datadir, 'Sars-Cov-2.gb')
preset_genomes = {
'Mbovis-AF212297':{'sequence':mbovis_genome, 'gb':mbovis_gb, 'mask':mbovis_mask},
'MTB-H37Rv':{'sequence':mtb_genome, 'gb':mtb_gb, 'mask':mtb_mask},
'MAP-K10':{'sequence':map_genome, 'gb':map_gb},
'M.smegmatis-MC2155':{'sequence':msmeg_genome, 'gb':msmeg_gb},
'Sars-Cov-2':{'sequence':sarscov2_genome, 'gb':sarscov2_gb}
}
#windows only path to binaries
bin_path = os.path.join(config_path, 'binaries')
#this is a custom filter
default_filter = 'QUAL>=40 && FORMAT/DP>=30 && DP4>=4'
annotatestr = '"AD,ADF,ADR,DP,SP,INFO/AD,INFO/ADF,INFO/ADR"'
if not os.path.exists(config_path):
try:
os.makedirs(config_path, exist_ok=True)
except:
os.makedirs(config_path)
defaults = {'threads':4, 'labelsep':'_', 'labelindex':0,
'trim':False, 'unmapped':False, 'quality':25,
'aligner': 'bwa', 'platform': 'illumina', 'species': None,
'filters': default_filter, 'custom_filters': False, 'mask': None,
'reference': None, 'gb_file': None, 'overwrite':False,
'omit_samples': [], 'get_stats':True,
'buildtree':False, 'bootstraps':100}
[docs]def copy_ref_genomes():
"""Copy default ref genome files to config dir"""
files = glob.glob(os.path.join(datadir, '*.fa'))
path = sequence_path
if not os.path.exists(path):
os.makedirs(path,exist_ok=True)
for src in files:
dest = os.path.join(path, os.path.basename(src))
shutil.copy(src, dest)
return
copy_ref_genomes()
[docs]def fetch_binaries():
"""Get windows binaries -- windows only"""
url = "https://github.com/dmnfarrell/snipgenie/raw/master/win_binaries/"
os.makedirs(bin_path, exist_ok=True)
names = ['bcftools.exe','bwa.exe','samtools.exe','tabix.exe',
'subread-align.exe','subread-buildindex.exe','fasttree.exe',
'makeblastdb.exe','minimap2.exe','rush.exe',
'msys-2.0.dll','msys-bz2-1.dll','msys-lzma-5.dll','msys-ncursesw6.dll','msys-z.dll']
for n in names:
filename = os.path.join(bin_path,n)
if os.path.exists(filename):
continue
print ('fetching %s' %n)
link = os.path.join(url,n)
print (filename,link)
urllib.request.urlretrieve(link, filename)
return
[docs]def get_files_from_paths(paths, ext='*.f*q.gz', filter_list=None):
"""Get files in multiple paths.
Args:
ext: wildcard for file types to parse eg. *.f*q.gz]
filter_list: list of labels that should be present in the filenames, optional
"""
if not type(paths) == list:
paths = [paths]
files=[]
for path in paths:
if not os.path.exists(path):
print ('the folder %s does not exist' %path)
s = glob.glob(os.path.join(path,'**/'+ext), recursive=True)
files.extend(s)
found = []
if filter_list != None:
for f in files:
for n in filter_list:
if n in f:
found.append(f)
files=found
return files
[docs]def get_samples(filenames, sep='-', index=0):
"""Get sample pairs from list of files, usually fastq. This
returns a dataframe of unique sample labels for the input and tries
to recognise the paired files.
Args:
sep: separator to split name on
index: placement of label in split list, default 0
"""
res = []
cols = ['name','sample','filename']
for filename in filenames:
name = os.path.basename(filename)#.split('.')[0]
name = name.removesuffix('.fastq.gz')
#make sure we remove pair numbers at end before getting sample
if name[-2:] == '_1' or name[-2:] == '_2':
label = name[:-2]
else:
label = name
#print (label)
sample = label.split(sep)[index]
x = [name, sample, os.path.abspath(filename)]
res.append(x)
df = pd.DataFrame(res, columns=cols)
df = df.sort_values(['sample','filename'])
df['pair'] = df.groupby('sample').cumcount()+1
df = df.drop_duplicates('filename')
return df
[docs]def get_pivoted_samples(df):
"""Get pivoted samples by pair, returns a table with one sample per row and
filenames in separate columns.
"""
p = pd.pivot_table(df,index='sample',columns='pair',values=['filename','name'],
aggfunc='first')
if len(p.columns) > 4:
print ('error in filename parsing, check labelsep and labelindex options')
print (df[:10])
return
c = list(zip(p.columns.get_level_values(0),p.columns.get_level_values(1)))
p.columns = [i[0]+str(i[1]) for i in c]
p = p.fillna('')
p = p.reset_index()
return p
[docs]def check_samples_unique(samples):
"""Check that sample names are unique"""
x = samples[samples['sample'].duplicated()]
if len(x)>0:
return False
[docs]def write_samples(df, path):
"""Write out sample names only using dataframe from get_samples"""
filename = os.path.join(path, 'samples.txt')
df.to_csv(filename, index=False, header=False)
return filename
[docs]def get_samples_from_bam(filenames, sep='-', index=0):
"""Samples from bam files"""
res = []
cols = ['name','sample','bam_file']
for filename in filenames:
name = os.path.basename(filename)
label = name.removesuffix('.bam').split(sep)[index]
x = [name, label, os.path.abspath(filename)]
res.append(x)
df = pd.DataFrame(res, columns=cols)
df = df.sort_values(['sample','bam_file'])
df['pair'] = df.groupby('sample').cumcount()+1
df = df.drop_duplicates('bam_file')
return df
[docs]def check_samples_aligned(samples, outdir):
"""Check how many samples already aligned"""
found = glob.glob(os.path.join(outdir,'*.bam'))
print ('%s/%s samples already aligned' %(len(found),len(samples)))
return
[docs]def mapping_stats(samples):
"""Get stats on mapping of samples"""
def get_stats(x):
d = tools.samtools_flagstat(x)
s = pd.Series(d)
return s
for i,r in samples.iterrows():
s = get_stats(r.bam_file)
samples.loc[i,'mapped'] = s['mapped']
total = tools.get_fastq_size(r.filename1)
#if 'filename2' in samples.columns:
# total += tools.get_fastq_size(r.filename1)
samples.loc[i,'reads'] = total
samples.loc[i,'perc_mapped'] = round(s['mapped']/total*100,2)
return samples
[docs]def clean_bam_files(samples, path, remove=False):
"""Check if any bams in output not in samples and remove. Not used in workflow."""
bams = get_files_from_paths(os.path.abspath(path), '*.bam')
print ('%s bam files and %s samples found' %(len(bams),len(samples)))
found = set(bams)-set(samples.bam_file)
print ('bam files no longer present in samples:')
print (found)
if remove == True:
for f in found:
print ('removed %s' %f)
os.remove(f)
return
[docs]def fetch_contam_file():
"""Get contam sequences"""
url = "https://github.com/dmnfarrell/snipgenie/raw/master/extra/contam.fa.gz"
os.makedirs(bin_path, exist_ok=True)
tempdir = tempfile.gettempdir()
filename = os.path.join(tempdir,'contam.fa.gz')
destfile = os.path.join(sequence_path,'contam.fa')
if os.path.exists(destfile):
return
print ('fetching contaminant sequences..')
#link = os.path.join(url,n)
print (filename)
urllib.request.urlretrieve(url, filename)
tools.gunzip(filename, destfile)
return
[docs]def blast_contaminants(filename, limit=2000, random=False, pident=98, qcovs=90):
"""Blast reads to contaminants database
Returns: percentages of reads assigned to each species.
"""
fetch_contam_file()
path = os.path.join(sequence_path,'contam.fa')
tools.make_blast_database(path)
if random == True:
seqs = tools.fastq_random_seqs(filename, limit)
else:
seqs = tools.fastq_to_rec(filename, limit)
bl = tools.blast_sequences(path,seqs,maxseqs=1)
bl['stitle'] = bl.stitle.apply(lambda x: x.split('__')[0])
bl = bl[(bl.qcovs>qcovs) & (bl.pident>pident)]
c = bl.stitle.value_counts()
c = pd.DataFrame(c)
c.columns = ['hits']
c['perc_hits'] = c.hits/limit*100
c=c[c.hits>5]
#print (c)
return c
[docs]def align_reads(df, idx, outdir='mapped', callback=None, aligner='bwa', platform='illumina',
unmapped=None, **kwargs):
"""
Align multiple files. Requires a dataframe with a 'sample' column to indicate
paired files grouping. If a trimmed column is present these files will align_reads
instead of the raw ones.
Args:
df: dataframe with sample names and filenames
idx: index name
outdir: output folder
unmapped_dir: folder for unmapped files if required
"""
if not os.path.exists(outdir):
os.makedirs(outdir, exist_ok=True)
if unmapped != None and not os.path.exists(unmapped):
os.makedirs(unmapped, exist_ok=True)
new = []
samtoolscmd = tools.get_cmd('samtools')
for i,r in df.iterrows():
name = r['sample']
if 'trimmed1' in df.columns:
print('using trimmed files..')
file1 = r.trimmed1
file2 = r.trimmed2
else:
file1 = r.filename1
if 'filename2' in df.columns:
file2 = r.filename2
else:
file2 = None
out = os.path.join(outdir,name+'.bam')
if aligner == 'bwa':
aligners.bwa_align(file1, file2, idx=idx, out=out, unmapped=unmapped, **kwargs)
elif aligner == 'bowtie':
idx = os.path.splitext(os.path.basename(idx))[0]
aligners.bowtie_align(file1, file2, idx=idx, out=out, **kwargs)
elif aligner == 'subread':
idx = os.path.splitext(os.path.basename(idx))[0]
aligners.subread_align(file1, file2, idx=idx, out=out, **kwargs)
elif aligner == 'minimap2':
#idx = os.path.splitext(os.path.basename(idx))[0]
aligners.minimap2_align(file1, file2, idx=idx, out=out, platform=platform, **kwargs)
bamidx = out+'.bai'
if not os.path.exists(bamidx) or kwargs['overwrite']==True:
print('aligning %s' %name)
cmd = '{s} index {o}'.format(o=out,s=samtoolscmd)
subprocess.check_output(cmd,shell=True)
print (cmd)
#index = df.index
df.loc[i,'bam_file'] = os.path.abspath(out)
#find mean depth/coverage
if 'meandepth' not in df.columns or pd.isnull(df.loc[i,'meandepth']):
cols = ['coverage','meandepth']
c = tools.samtools_coverage(out)
df.loc[i,cols] = c[cols]
return df
[docs]def mpileup(bam_file, ref, out, overwrite=False):
"""Run bcftools for single file."""
bcftoolscmd = tools.get_cmd('bcftools')
if os.path.exists(out):
return
cmd = '{bc} mpileup -a {a} -O b --min-MQ 60 -o {o} -f {r} {b}'\
.format(r=ref, b=bam_file, o=out, bc=bcftoolscmd, a=annotatestr)
#print (cmd)
subprocess.check_output(cmd, shell=True)
cmd = 'bcftools index {o}'.format(o=out)
subprocess.check_output(cmd, shell=True)
return
[docs]def mpileup_region(region,out,bam_files,callback=None):
"""Run bcftools for single region."""
bcftoolscmd = tools.get_cmd('bcftools')
cmd = 'bcftools mpileup -r {reg} -O b -o {o} -f {r} {b}'.format(r=ref, reg=region, b=bam_files, o=out)
if callback != None:
callback(cmd)
subprocess.check_output(cmd, shell=True)
cmd = 'bcftools index {o}'.format(o=out)
subprocess.check_output(cmd, shell=True)
return
[docs]def worker(args):
mpileup(args[0], args[1], args[2])
[docs]def mpileup_multiprocess(bam_files, ref, outpath, threads=4, callback=None):
"""Run mpileup in parallel over multiple files and make separate bcfs.
Assumes alignment to a bacterial reference with a single chromosome."""
bcftoolscmd = tools.get_cmd('bcftools')
#size = len(bam_files)
#pool = mp.Pool(threads)
outfiles = []
st = time.time()
bcfpath = os.path.join(outpath,'bcf')
if not os.path.exists(bcfpath):
os.mkdir(bcfpath)
for bam_file in bam_files:
name = os.path.splitext(os.path.basename(bam_file))[0]
out = '{o}/{f}.bcf'.format(o=bcfpath,f=name)
outfiles.append(out)
refs = [ref] * len(outfiles)
data = list(zip(bam_files,refs,outfiles))
#print (data)
p = mp.Pool(threads)
p.map_async(worker, data)
p.close()
p.join()
t=time.time()-st
print ('took %s seconds' %str(round(t,3)))
rawbcf = os.path.join(outpath,'raw.bcf')
bcf_files = ' '.join(outfiles)
cmd = '{bc} merge --threads {t} -o {r} {b}'.format(b=bcf_files,r=rawbcf, bc=bcftoolscmd,t=threads)
print (cmd)
subprocess.check_output(cmd, shell=True)
return rawbcf
[docs]def mpileup_parallel(bam_files, ref, outpath, threads=4, callback=None, tempdir=None):
"""Run mpileup in over multiple regions with GNU parallel on linux or rush on Windows
Separate bcf files are then joined together.
Assumes alignment to a bacterial reference with a single chromosome.
"""
if tempdir == None:
tempdir = tempfile.tempdir
bam_files = ' '.join(bam_files)
rawbcf = os.path.join(outpath,'raw.bcf')
chr = tools.get_chrom(ref)
length = tools.get_fasta_length(ref)
x = np.linspace(1,length,threads+1,dtype=int)
print (x)
#split genome into blocks
blocks=[]
for i in range(len(x)):
if i < len(x)-1:
blocks.append((x[i],x[i+1]-1))
#get temp outfile names
outfiles = []
regions = []
for start,end in blocks:
region = '"{c}":{s}-{e}'.format(c=chr,s=start,e=end)
regions.append(region)
out = os.path.join(tempdir,'{s}-{e}.bcf'.format(s=start,e=end))
outfiles.append(out)
regstr = ' '.join(regions)
#print (regstr)
filesstr = ' '.join(outfiles)
bcftoolscmd = tools.get_cmd('bcftools')
if platform.system() == 'Windows':
rushcmd = tools.get_cmd('rush')
cmd = 'echo {reg} | {rc} -D " " "{bc} mpileup -r {{}} -f {r} -a {a} --min-MQ 60 {b} -o {p}/{{@[^:]*$}}.bcf"'\
.format(rc=rushcmd,bc=bcftoolscmd,reg=regstr,r=ref,b=bam_files,a=annotatestr,p=tempdir)
else:
cmd = 'parallel bcftools mpileup -r {{1}} -a {a} -O b --min-MQ 60 -o {{2}} -f {r} {b} ::: {reg} :::+ {o}'\
.format(r=ref, reg=regstr, b=bam_files, o=filesstr, a=annotatestr)
print (cmd)
#if callback != None:
# callback(cmd)
subprocess.check_output(cmd, shell=True)
#concat the separate files
cmd = '{bc} concat {i} -O b -o {o}'.format(bc=bcftoolscmd,i=' '.join(outfiles),o=rawbcf)
print (cmd)
subprocess.check_output(cmd, shell=True)
#remove temp files
for f in outfiles:
os.remove(f)
return rawbcf
[docs]def variant_calling(bam_files, ref, outpath, relabel=True, threads=4,
callback=None, overwrite=False, filters=None, gff_file=None,
mask=None, tempdir=None,
custom_filters=False, **kwargs):
"""Call variants with bcftools"""
st = time.time()
sample_file = os.path.join(outpath,'samples.txt')
if filters == None:
filters = default_filter
rawbcf = os.path.join(outpath,'raw.bcf')
bcftoolscmd = tools.get_cmd('bcftools')
if not os.path.exists(rawbcf) or overwrite == True:
print ('running mpileup..')
'''if threads == 1:
bam_files = ' '.join(bam_files)
cmd = '{bc} mpileup -a {a} --max-depth 500 -O b --min-MQ 60 -o {o} -f {r} {b}'\
.format(bc=bcftoolscmd,r=ref, b=bam_files, o=rawbcf, a=annotatestr)
print (cmd)
subprocess.check_output(cmd, shell=True)
#or use mpileup in parallel to speed up
else:
rawbcf = mpileup_parallel(bam_files, ref, outpath, threads=threads,
tempdir=tempdir, callback=callback)'''
#new method
rawbcf = mpileup_multiprocess(bam_files, ref, outpath, threads=threads,
callback=callback)
else:
print ('%s already exists' %rawbcf)
#check existing file samples here
rawsamples = tools.get_vcf_samples(rawbcf)
samples = pd.read_csv(sample_file,names=['name'])
if len(samples) != len(rawsamples):
print ('WARNING: samples in raw.bcf appear to be different to current samples.')
print ('You may have added files since the previous run and will need to overwrite raw.bcf')
#find snps only
print ('calling variants..')
vcfout = os.path.join(outpath,'calls.vcf')
cmd = '{bc} call --ploidy 1 -m -v -o {o} {raw}'.format(bc=bcftoolscmd,o=vcfout,raw=rawbcf)
#if callback != None:
# callback(cmd)
print (cmd)
subprocess.check_output(cmd,shell=True)
#relabel samples in vcf header
if relabel == True:
relabel_vcfheader(vcfout, sample_file)
#filters
filtered = os.path.join(outpath,'filtered.vcf.gz')
cmd = '{bc} filter -i "{f}" -o {o} -O z {i}'.format(bc=bcftoolscmd,i=vcfout,o=filtered,f=filters)
print (cmd)
tmp = subprocess.check_output(cmd,shell=True)
if callback != None:
callback(cmd)
#get only snps
print ('splitting snps and indels..')
snpsout = os.path.join(outpath,'snps.vcf.gz')
cmd = '{bc} view -v snps -o {o} -O z {i}'.format(bc=bcftoolscmd,o=snpsout,i=filtered)
print (cmd)
subprocess.check_output(cmd,shell=True)
#also get indels only to separate file
indelsout = os.path.join(outpath,'indels.vcf.gz')
#cmd = '{bc} call -V snps --ploidy 1 -m -v -o {o} {raw}'.format(bc=bcftoolscmd,o=indelsout,raw=rawbcf)
cmd = '{bc} view -v indels -o {o} -O z {i}'.format(bc=bcftoolscmd,o=indelsout,i=filtered)
print (cmd)
subprocess.check_output(cmd,shell=True)
#apply mask if required
if mask != None:
mask_filter(snpsout, mask, outdir=outpath, overwrite=True)
#custom filters
if custom_filters == True:
site_proximity_filter(snpsout, outdir=outpath, overwrite=True)
#consequence calling
if gff_file != None:
print ('consequence calling..')
try:
csqout = os.path.join(outpath, 'csq.tsv')
m = csq_call(ref, gff_file, snpsout, csqout)
m.to_csv(os.path.join(outpath,'csq.matrix'))
#indels as well
csqout = os.path.join(outpath, 'csq_indels.tsv')
m = csq_call(ref, gff_file, indelsout, csqout)
m.to_csv(os.path.join(outpath,'csq_indels.matrix'))
except Exception as e:
print (e)
print ('took %s seconds' %str(round(time.time()-st,0)))
return snpsout
[docs]def csq_call(ref, gff_file, vcf_file, csqout):
"""Consequence calling"""
bcftoolscmd = tools.get_cmd('bcftools')
cmd = '{bc} csq -f {r} -g {g} {f} -Ot -o {o}'.format(bc=bcftoolscmd,r=ref,g=gff_file,
f=vcf_file,o=csqout)
print (cmd)
#if callback != None:
# callback(cmd)
tmp = subprocess.check_output(cmd,shell=True)
csqdf = read_csq_file(csqout)
#get presence/absence matrix of csq mutations
m = get_aa_snp_matrix(csqdf)
return m
[docs]def mask_filter(vcf_file, mask_file, overwrite=False, outdir=None):
"""Remove any masked sites using a bed file, overwrites input"""
print('using mask bed file', mask_file)
mask = pd.read_csv(mask_file,sep='\t',names=['chrom','start','end'])
#print (mask)
def do_mask(x,i):
#print (x.start, x.end, i)
if (x.start<=i) & (x.end>=i):
return 1
import vcf
vcf_reader = vcf.Reader(open(vcf_file, 'rb'))
sites = [record.POS for record in vcf_reader]
print('%s sites' %len(sites))
found = []
for i in sites:
m = mask.apply( lambda x: do_mask(x,i),1)
m = m[m==1]
if len(m)>0:
#print (i)
found.append(i)
print('found %s sites in masked regions' %len(found))
new = sorted(list(set(sites) - set(found)))
if outdir == None:
outdir = tempfile.gettempdir()
if overwrite == True:
overwrite_vcf(vcf_file, new, outdir)
return
[docs]def site_proximity_filter(vcf_file, dist=10, overwrite=False, outdir=None):
"""Remove any pairs of sites within dist of each other.
Args:
vcf_file: input vcf file with positions to filter
dist: distance threshold
overwrite: whether to overwrite the vcf
"""
#get vcf into dataframe
df = tools.vcf_to_dataframe(vcf_file)
df = df[df.REF != df.ALT]
sites = list(df.pos.unique())
found = []
#check distances in sites per sample
for s, g in df.groupby(['sample']):
pos = list(g.pos)
for i in range(len(pos)-1):
if pos[i+1] - pos[i] <= dist:
found.extend([pos[i], pos[i+1]])
#all unique positions
found = list(set(found))
new = sorted(list(set(sites) - set(found)))
print ('proximity filter removed %s/%s sites' %(len(found),len(sites)))
if overwrite == True:
overwrite_vcf(vcf_file, new, outdir)
return
[docs]def overwrite_vcf(vcf_file, sites, outdir=None):
"""Make a new vcf with subset of sites"""
if outdir == None:
outdir = tempfile.gettempdir()
import vcf
out = os.path.join(outdir,'temp.vcf')
vcf_reader = vcf.Reader(open(vcf_file, 'rb'))
vcf_writer = vcf.Writer(open(out, 'w'), vcf_reader)
for record in vcf_reader:
if record.POS in sites:
#print (record)
vcf_writer.write_record(record)
vcf_writer.close()
#copy or overwrite input vcf
bcftoolscmd = tools.get_cmd('bcftools')
cmd = '{b} view {o} -O z -o {gz}'.format(b=bcftoolscmd,o=out,gz=vcf_file)
print (cmd)
tmp = subprocess.check_output(cmd,shell=True)
return
[docs]def trim_files(df, outpath, overwrite=False, threads=4, quality=30):
"""Batch trim fastq files"""
method = 'cutadapt'
if platform.system() == 'Windows':
method = 'default'
if not os.path.exists(outpath):
os.makedirs(outpath, exist_ok=True)
for i,row in df.iterrows():
out1 = os.path.join(outpath, os.path.basename(row.filename1))
out2 = os.path.join(outpath, os.path.basename(row.filename2))
if not os.path.exists(out1) or overwrite == True:
out1,out2 = tools.trim_reads(row.filename1, row.filename2,
outpath, threads=threads, quality=quality, method=method)
df.loc[i,'trimmed1'] = out1
df.loc[i,'trimmed2'] = out2
return df
[docs]def read_csq_file(filename):
"""Read csq tsv outpt file into dataframe"""
cols = ['1','sample','2','chrom','start','snp_type','gene','locus_tag','strand','feature_type','aa','nuc',]
csqdf = pd.read_csv(filename,sep='[|\t]',comment='#',names=cols, engine='python')
csqdf['aa'] = csqdf.aa.fillna(csqdf.snp_type)
csqdf['nuc'] = csqdf.nuc.fillna(csqdf.snp_type)
return csqdf
[docs]def get_aa_snp_matrix(df):
"""Get presence/absence matrix from csq calls table"""
df = df.drop_duplicates(['gene','aa','sample'])
x = df.set_index(['start','gene','aa','snp_type','sample'])['nuc'].unstack('sample')
x[x.notna()] = 1
x = x.fillna(0)
return x
[docs]def run_bamfiles(bam_files, ref, gff_file=None, mask=None, outdir='.', threads=4,
sep='_', labelindex=0, samples=None, **kwargs):
"""
Run workflow with bam files from a previous sets of alignments.
We can arbitrarily combine results from multiple other runs this way.
kwargs are passed to variant_calling method.
Should write a samples.txt file in the outdir if vcf header is to be
relabelled.
Args:
samples: dataframe of sample names, if not provided try to get from bam files
"""
if not os.path.exists(outdir):
os.makedirs(outdir, exist_ok=True)
#write sample names if not provided
if samples is None:
samples = get_samples_from_bam(bam_files, sep=sep)
write_samples(samples[['sample']], outdir)
print ('%s samples were loaded:' %len(bam_files))
vcf_file = variant_calling(bam_files, ref, outdir, threads=threads,
relabel=True, gff_file=gff_file, mask=mask,
**kwargs)
snprecs, smat = tools.core_alignment_from_vcf(vcf_file)
outfasta = os.path.join(outdir, 'core.fa')
SeqIO.write(snprecs, outfasta, 'fasta')
smat.to_csv(os.path.join(outdir,'core.txt'), sep=' ')
aln = AlignIO.read(outfasta, 'fasta')
#remove ref
aln = aln[1:]
snp_dist = tools.snp_dist_matrix(aln)
snp_dist.to_csv(os.path.join(outdir,'snpdist.csv'), sep=',')
treefile = trees.run_RAXML(outfasta, outpath=outdir)
ls = len(smat)
trees.convert_branch_lengths(treefile,os.path.join(outdir,'tree.newick'), ls)
return
class Logger(object):
"""
This class duplicates sys.stdout to a log file
source: https://stackoverflow.com/q/616645
"""
def __init__(self, filename="run.log", mode="a"):
self.stdout = sys.stdout
self.file = open(filename, mode)
sys.stdout = self
def __del__(self):
self.close()
def __enter__(self):
pass
def __exit__(self, *args):
self.close()
def write(self, message):
self.stdout.write(message)
self.file.write(message)
def flush(self):
self.stdout.flush()
self.file.flush()
os.fsync(self.file.fileno())
def close(self):
if self.stdout != None:
sys.stdout = self.stdout
self.stdout = None
if self.file != None:
self.file.close()
self.file = None
[docs]class Logger(object):
def __init__(self, logfile='log.dat'):
self.terminal = sys.stdout
self.log = open(logfile, "a")
[docs] def write(self, message):
self.terminal.write(message)
self.log.write(message)
[docs] def flush(self):
return
[docs]class WorkFlow(object):
"""Class for implementing a prediction workflow from a set of options"""
def __init__(self, **kwargs):
for i in kwargs:
self.__dict__[i] = kwargs[i]
for i in defaults:
if i not in self.__dict__:
self.__dict__[i] = defaults[i]
#make output folder
if not os.path.exists(self.outdir):
os.makedirs(self.outdir, exist_ok=True)
#start logger
self.logfile = os.path.join(self.outdir, 'run.log')
#sys.stdout = Logger(self.logfile)
print ('The following options were supplied')
dt_string = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
print("time: ", dt_string)
print ('-------')
for i in self.__dict__:
print (i, ':', self.__dict__[i])
print ()
return
[docs] def setup(self):
"""Setup main parameters"""
if self.species != None:
s = self.species
if s not in preset_genomes:
valid = '; '.join(list(preset_genomes.keys()))
print ('Invalid species value! Use one of: %s' %valid)
return
self.reference = preset_genomes[s]['sequence']
self.gb_file = preset_genomes[s]['gb']
if s == 'Mbovis-AF212297':
self.mask = mbovis_mask
elif self.reference == None:
self.reference = mbovis_genome
self.gb_file = mbovis_gb
self.filenames = get_files_from_paths(self.input)
if self.threads == None:
import multiprocessing
self.threads = multiprocessing.cpu_count()
else:
self.threads = int(self.threads)
df = get_samples(self.filenames, sep=self.labelsep, index=self.labelindex)
df = get_pivoted_samples(df)
if df is None:
return
if len(df) == 0:
print ('no samples provided. files should be fastq.gz type')
return False
df['read_length'] = df.filename1.apply(tools.get_fastq_info)
self.fastq_table = df
sample_size = len(df['sample'].unique())
print ('%s samples were loaded:' %sample_size)
print ('----------------------')
print (df)
print ()
s = check_samples_unique(df)
if s == False:
print ('samples names are not unique! try a different labelsep value.')
return False
print ('building index')
if self.aligner == 'bwa':
aligners.build_bwa_index(self.reference)
elif self.aligner == 'bowtie':
aligners.build_bowtie_index(self.reference)
elif self.aligner == 'subread':
aligners.build_subread_index(self.reference)
if self.gb_file != None:
#convert annotation to gff for consequence calling
self.gff_file = os.path.join(self.outdir, os.path.basename(self.gb_file)+'.gff')
tools.gff_bcftools_format(self.gb_file, self.gff_file)
else:
self.gff_file = None
#set temp dir
self.tempdir = os.path.join(self.outdir, 'tmp')
if not os.path.exists(self.tempdir):
os.makedirs(self.tempdir)
time.sleep(1)
return True
[docs] def run(self):
"""Run workflow"""
#this master table tracks our outputs
samples = self.fastq_table
if not os.path.exists(self.outdir):
os.makedirs(self.outdir, exist_ok=True)
write_samples(samples[['sample']], self.outdir)
if len(samples)==0:
print ('no samples found')
return
if self.trim == True:
print ('trimming fastq files')
print ('--------------------')
trimmed_path = os.path.join(self.outdir, 'trimmed')
samples = trim_files(samples, trimmed_path, self.overwrite,
quality=self.quality, threads=self.threads)
print ()
print ('aligning files')
print ('--------------')
print ('Using reference genome: %s' %self.reference)
path = os.path.join(self.outdir, 'mapped')
if self.unmapped == True:
unmapped = os.path.join(self.outdir, 'unmapped')
else:
unmapped = None
check_samples_aligned(samples, path)
samples = align_reads(samples, idx=self.reference, outdir=path,
aligner=self.aligner, platform=self.platform,
unmapped=unmapped,
threads=self.threads, overwrite=self.overwrite)
lowdepth = samples[samples.meandepth<15]
if len(lowdepth)>0:
print ('%s samples have mean depth <15' %len(lowdepth))
#mapping stats
#if 'mapped' not in samples.columns and self.get_stats == True:
#print ('getting mapping stats..')
#samples = mapping_stats(samples)
#save sample table
samples.to_csv(os.path.join(self.outdir,'samples.csv'),index=False)
print ()
print ('calling variants')
print ('----------------')
bam_files = list(samples.bam_file.unique())
self.vcf_file = variant_calling(bam_files, self.reference, self.outdir,
threads=self.threads,
gff_file=self.gff_file,
filters=self.filters,
mask=self.mask,
custom_filters=self.custom_filters,
overwrite=self.overwrite,
tempdir=self.tempdir)
print (self.vcf_file)
print ()
print ('making SNP matrix')
print ('-----------------')
snprecs, smat = tools.core_alignment_from_vcf(self.vcf_file, omit=self.omit_samples)
outfasta = os.path.join(self.outdir, 'core.fa')
SeqIO.write(snprecs, outfasta, 'fasta')
#write out sites matrix as txt file
smat.to_csv(os.path.join(self.outdir,'core.txt'), sep=' ')
print ()
#write out pairwise snp distances
aln = AlignIO.read(outfasta, 'fasta')
aln = aln[1:]
snp_dist = tools.snp_dist_matrix(aln)
snp_dist.to_csv(os.path.join(self.outdir,'snpdist.csv'), sep=',')
print ('Done. Sample summary:')
print ('---------------------')
pd.set_option('display.max_rows', 150)
print (samples.drop(columns=list(samples.filter(regex='filename'))))
print ()
if self.buildtree == True:
print ('building tree')
print ('-------------')
if len(bam_files) <= 2:
print ('Cannot build tree, too few samples.')
return
if platform.system() == 'Windows':
treefile = trees.run_fasttree(outfasta, self.outdir)
else:
treefile = trees.run_RAXML(outfasta, threads=self.threads,
bootstraps=self.bootstraps, outpath=self.outdir)
if treefile == None:
return
ls = len(smat)
trees.convert_branch_lengths(treefile,os.path.join(self.outdir,'tree.newick'), ls)
print ()
#check unmapped reads
return
[docs]def test_run():
"""Test run"""
testdatadir = 'testing'
out = 'subread_results'
args = {'threads':4, 'outdir': out, 'input': testdatadir,
'species':'Mbovis-AF212297',
'aligner':'subread', 'filters':'QUAL>=40 && DP4>=4',
'reference': None, 'overwrite':True}
W = WorkFlow(**args)
st = W.setup()
if st == True:
W.run()
vdf = tools.vcf_to_dataframe(W.vcf_file)
print (vdf)
return
[docs]def main():
"Run the application"
import sys, os
from argparse import ArgumentParser
parser = ArgumentParser(description='snipgenie CLI tool. https://github.com/dmnfarrell/snipgenie')
parser.add_argument("-i", "--input", action='append', dest="input", default=[],
help="input folder(s)", metavar="FILE")
#parser.add_argument("-l", "--labels", dest="labels", default=[],
# help="sample labels file, optional", metavar="FILE")
parser.add_argument("-e", "--labelsep", dest="labelsep", default='_',
help="symbol to split the sample labels on")
parser.add_argument("-x", "--labelindex", dest="labelindex", default=0,
help="position to extract label in split filenames")
parser.add_argument("-r", "--reference", dest="reference", default=None,
help="reference genome filename", metavar="FILE")
parser.add_argument("-S", "--species", dest="species", default=None,
help="set the species reference genome, overrides -r")
parser.add_argument("-g", "--genbank_file", dest="gb_file", default=None,
help="annotation file, optional", metavar="FILE")
parser.add_argument("-t", "--threads", dest="threads", default=4,
help="cpu threads to use")
parser.add_argument("-w", "--overwrite", dest="overwrite", action="store_true", default=False,
help="overwrite intermediate files")
parser.add_argument("-T", "--trim", dest="trim", action="store_true", default=False,
help="whether to trim fastq files" )
parser.add_argument("-U", "--unmapped", dest="unmapped", action="store_true", default=False,
help="whether to save unmapped reads" )
parser.add_argument("-Q", "--quality", dest="quality", default=25,
help="right trim quality, default 25")
parser.add_argument("-f", "--filters", dest="filters", default=default_filter,
help="variant calling post-filters" )
parser.add_argument("-m", "--mask", dest="mask", default=None,
help="mask regions from a bed file" )
parser.add_argument("-c", "--custom", dest="custom_filters", action="store_true", default=False,
help="apply custom filters" )
parser.add_argument("-p", "--platform", dest="platform", default='illumina',
help="sequencing platform, change to ont if using oxford nanopore")
parser.add_argument("-a", "--aligner", dest="aligner", default='bwa',
help="aligner to use, bwa, subread, bowtie or minimap2")
parser.add_argument("-b", "--buildtree", dest="buildtree", action="store_true", default=False,
help="whether to build a phylogenetic tree, requires RaXML" )
parser.add_argument("-N", "--bootstraps", dest="bootstraps", default=100,
help="number of bootstraps to build tree")
parser.add_argument("-o", "--outdir", dest="outdir",
help="Results folder", metavar="FILE")
#parser.add_argument("-O", "--omit", dest="omit_samples",
# help="List of sample names to omit of required", metavar="FILE")
parser.add_argument("-q", "--qc", dest="qc", action="store_true",
help="Get version")
parser.add_argument("-d", "--dummy", dest="dummy", action="store_true",
default=False, help="Check samples but don't run")
parser.add_argument("-X", "--test", dest="test", action="store_true",
default=False, help="Test run")
parser.add_argument("-v", "--version", dest="version", action="store_true",
help="Get version")
args = vars(parser.parse_args())
check_platform()
if args['test'] == True:
test_run()
elif args['version'] == True:
from . import __version__
print ('snipgenie version %s' %__version__)
print ('https://github.com/dmnfarrell/snipgenie')
elif args['qc'] == True:
print ('Running qc report')
qcfile = 'qc_report.pdf'
filenames = get_files_from_paths(args['input'])
tools.pdf_qc_reports(filenames, qcfile)
elif args['outdir'] == None:
print ('No input or output folders provided. These are required.')
print ('Example:')
print ('snipgenie -r <reference> -i <input folder with fastq.gz files> -o <output folder>')
print ('Use -h for more help on options.')
else:
W = WorkFlow(**args)
st = W.setup()
if st == True and args['dummy'] == False:
W.run()
if __name__ == '__main__':
main()