#!/usr/bin/env python """sniper.py: SNP Identification using Probability of Every Read. Version: 1.0 (unix/mac) Daniel F. Simola (simola@mail.med.upenn.edu) Laboratory of Junhyong Kim University of Pennsylvania 25 January 2011 Copyright (c) 2011, Daniel F. Simola and Junhyong Kim, University of Pennsylvania. All Rights Reserved. You may not use this file except in compliance with the terms of our License. You may obtain a copy of the License at http://kim.bio.upenn.edu/software/ Unless required by applicable law or agreed to in writing, this software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ import os, re, sys, math, time, stat, threading, time, random from operator import add,mul from string import * nastr = '"NA"' # represents missing value # LOCAL METHODS # ----------------------------------------------------------------- # ----------------------------------------------------------------- fixNull = lambda nastr: lambda x: x=='' and nastr or x SGr = {} # global current read pileup dictionary # helper function for pileup updating def delete(x): global SGr del SGr[x] return 1 def pipeit(s, nl=0, pipe=sys.stdout): pipe.write(s+(nl==1 and '\n' or '')); sys.stdout.flush() def isnan(x): return str(x) == str(1e400*0) def unique(lst): """Returns a subset of lst containing the unique elements.""" d = {} for i in lst: if i not in d: d[i] = 0 else: d[i] += 1 e = d.keys(); e.sort() return e def maxlenval(tup): mlv = 0 for i in tup: x = len(str(i)) if x>mlv:mlv = x return mlv def fixspacing(item,nsp): v = str(item) if len(v) == nsp: return v delt = nsp-len(v) if delt < 0: v = v[0:nsp] else: for i in range(delt): v+=' ' return v def transpose(mat): """Given a 2d array of arrays, will invert the dimensions.""" newm = [] for c in range(len(mat[0])): newrow = [] for r in range(len(mat)): newrow.append(mat[r][c]) newm.append(newrow) return newm def flatten(sequence): def rflat(seq2): seq = [] for entry in seq2: if seqin([entry]): seq.extend([i for i in entry]) else: seq.append(entry) return seq def seqin(sequence): for i in sequence: ## all sequences have '__contains__' in their dir() ## parentheses present to aid commenting mid-condition if ('__contains__' in dir(i) and type(i) != str and type(i) != dict): return True return False seq = [sequence][:] # in case parameter isn't already a sequence while seqin(seq): seq = rflat(seq) return seq def createdir(dirname): if not dirname: return 0 if not os.access(dirname, os.F_OK): try: os.mkdir(dirname); return 1 except OSError: pass return 0 def record(astr, filename, ioMethod='a'): if ioMethod == 'a': assert os.access(filename, os.F_OK), 'Cannot access '+filename pmrfh = open(filename, ioMethod) print >>pmrfh, astr pmrfh.close() return 1 def trimComments(dat): # parse comments ndat = [] for line in dat: # only care about non commented lines if not re.match('(^\s*#.*$)|(^\s$)', line): # parse any commentary on this line cleanme = re.split('^(.*)#.*$', line) if len(cleanme) > 1: cleanme.pop(0) ndat.append(cleanme[0].strip('\n')) return ndat def getFiles(indirfile, include=[], exclude=[]): if type(include)!=type(list()): include = [include] if type(exclude)!=type(list()): exclude = [exclude] return getContents(indirfile, typ='files', include=include, exclude=exclude) def getDirectories(indirfile, include=[], exclude=[]): if type(include)!=type(list()): include = [include] if 
type(exclude)!=type(list()): exclude = [exclude] files = getContents(indirfile, typ='dirs', include=include, exclude=exclude) return map(slash, files) def getContents(indirfile, typ='all', include=[], exclude=[]): if type(include)!=type(list()): include = [include] if type(exclude)!=type(list()): exclude = [exclude] keepfiles = [] if len(include) == 0: include = [''] try: mode = os.stat(indirfile)[stat.ST_MODE] INISADIR = False if stat.S_ISDIR(mode): INISADIR = True indirfile = slash(indirfile) files = [] if INISADIR and os.access(indirfile, os.F_OK): for f in os.listdir(indirfile): # print indirfile, 'vs', f # if f[0] != '.' and f[0] != '~' and f[len(f)-3:len(f)] not in exclude: if f[0] != '.' and f[0] != '~': # do includes for item in include: if re.match('.*%s.*'%(item), f): # confirm not a directory mode = os.stat(indirfile+f)[stat.ST_MODE] if typ=='all': if stat.S_ISDIR(mode): files.append(indirfile+f) else: files.append(indirfile+f) elif typ=='files' and not stat.S_ISDIR(mode): files.append(indirfile+f) elif typ=='dirs' and stat.S_ISDIR(mode): files.append(indirfile+f) # else: print indirfile+f, 'is a directory' for f in files: inexclude = False for item in exclude: if re.match('.*%s.*'%(item), f): inexclude = True if inexclude == False: # confirm not a directory keepfiles.append(f) # try: # print 'testing', indirfile+f # mode = os.stat(indirfile+f)[stat.ST_MODE] # print 'mode', mode # except OSError: # print 'why?' # if typ=='all': # if stat.S_ISDIR(mode): keepfiles.append(indirfile+f+'/') # else: keepfiles.append(indirfile+f) # elif typ=='files' and not stat.S_ISDIR(mode): # keepfiles.append(indirfile+f) # elif typ=='dirs' and stat.S_ISDIR(mode): # print 'got a dir' # keepfiles.append(indirfile+f+'/') # else: # print 'failure', stat.S_ISDIR(mode) elif not INISADIR: keepfiles.append(indirfile) else: sys.exit('cannot access dir '+indirfile) except OSError: pass return keepfiles def printList(lst, title="", delim="\n", pipe=sys.stdout, file='', sort=False, newline='\n', ioMethod='w'): if file: pipe = open(file,ioMethod) if sort: lst.sort() if title: lst = [title]+lst#print >> pipe, title for i in range(len(lst)): if i == len(lst)-1: print >> pipe, str(lst[i]), else: print >> pipe, str(lst[i])+delim, if newline: print >> pipe, newline, if file: pipe.close() def readTable(filename, header=True, rownames=True, delim="\t", comments=True, keepKey=False): fh = open(filename) table = fh.readlines() fh.close() if comments: table = trimComments(table) tabHeader = []; rowIDs = []; data = [] if header: tabHeader = table.pop(0).strip('\n').split(delim) if not keepKey: tabHeader.pop(0) # else: tabHeader = table.pop(0).strip('\n').split(delim) for line in table: sline = line.strip('\n').split(delim) if rownames: rowIDs.append(sline.pop(0)) data += [sline] return data, rowIDs, tabHeader def printTable(tab, header=[], rows=[], delim="\t", newline='\n', file='', pipe=sys.stdout, ioMethod='w'): if file: pipe = open(file,ioMethod) if header: printList(header, delim=delim, pipe=pipe, newline=newline) if len(rows): assert len(rows) == len(tab), "Rows and tab must have same length." 
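# Illustrative note: when row labels are supplied, each label is prepended to its data row before
# printing. For example (hypothetical values), printTable([[1,2]], header=['a','b'], rows=['r1'])
# would emit a header line 'a<TAB>b' followed by the data line 'r1<TAB>1<TAB>2'.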
for ri in range(len(tab)): r = [rows[ri]]+tab[ri] printList(r, delim=delim, pipe=pipe, newline=newline) else: for r in tab: printList(r, delim=delim, pipe=pipe, newline=newline) if file: pipe.close() def printFormattedTable(tab, header=[], rows=[], delim=" ", newline='\n', file='', pipe=sys.stdout, nastr='"NA"', ioMethod='w', colSpacing=[]): if file: pipe = open(file,ioMethod) if tab == [] and file: pipe.close(); return tab2 = tab[:] if rows: for i in range(len(tab)): tab2[i] = [rows[i]]+tab[i] # need to determine for each column the max length value and space to that collens = colSpacing[:] if not len(collens): # print 'printformattedtable: fixing col spacing' temp = transpose(tab2) if len(header): for i in range(len(temp)): r = temp[i] r.append(header[i]) collens.append(maxlenval(r)) else: for r in temp: collens.append(maxlenval(r)) # now create a new tuple with proper spacing temp = []; sheadings = [] if len(header): for i in range(len(header)): sheadings.append(fixspacing(header[i],collens[i])) for row in tab2: newrow = [] for i in range(len(row)): val = row[i]==nastr and '.' or row[i] #if val==nastr: val = '.' # SAS missing value newrow.append(fixspacing(val,collens[i])) temp.append(newrow) # pipe = open(file,ioMethod) printTable(temp,header=sheadings,pipe=pipe,delim=delim,newline=newline) if file: pipe.close() def nts2iupac(nts): X = {'A':'A', 'T':'T', 'C':'C', 'G':'G', 'N':'N', 'U':'U', '-':'-'} X['GT'] = 'K' X['AC'] = 'M' X['CGT'] = 'B' X['ACG'] = 'V' X['CG'] = 'S' X['AT'] = 'W' X['AGT'] = 'D' X['CT'] = 'Y' X['AG'] = 'R' X['ACT'] = 'H' X['AA'] = 'A' X['TT'] = 'T' X['CC'] = 'C' X['GG'] = 'G' nts = ''.join(sorted(map(lambda x: x.upper(), nts))) return X[nts] def iupac2nts(a,double=True): X = {'A':'A', 'T':'T', 'C':'C', 'G':'G', 'N':'N', 'U':'U', '-':'-'} X['K'] = 'GT' X['M'] = 'AC' X['B'] = 'CGT' X['V'] = 'ACG' X['S'] = 'CG' X['W'] = 'AT' X['D'] = 'AGT' X['Y'] = 'CT' X['R'] = 'AG' X['H'] = 'ACT' if double: X['A'] = 'AA' X['T'] = 'TT' X['C'] = 'CC' X['G'] = 'GG' return X[a.upper()] def readFasta(faname, usealtid=False, split='^>', idPat='', addins=[], TOUPPER=False, VERBOSE=False): """Parses a multiple fasta file into a dictionary object keyed by fasta identifier. Option to key the dictionary by an alternative id found in the fasta header: altid=True|False. If directory of several fasta files is provided as faname, loads them into single dictionary. 
""" fadict = {} # return dictionary of fasta entries faj = '' # load all information into single data string, faj files = getFiles(faname) for f in files: if VERBOSE: print '- Loading %s'%(f) fh = open(f) fa = fh.readlines() fh.close() # remove comment lines faj += ''.join(filter(lambda x: not re.match(' *#.*', x) and True, fa)) # parse by entry (>) getentries = re.compile(split, re.MULTILINE) faentries = re.split(getentries, faj) # for some reason the first entry is the null string faentries.pop(0) # parse individual entries for entry in faentries: # first has format >name other-header-info (header, seq) = entry.split("\n", 1) # trim off the '>' character theid = "" altid = "" info = "" # further split up the info - for use of alternative identifier pattern = '^(\S+) (.*)$' if re.match(pattern, header): (crap, theid, info, crap) = re.split(pattern, header) info = info.strip(' ').strip('\t') if len(info) > 0: #print "\""+info+"\"" (crap, altid, info, crap) = re.split('(\S+)(.*)', info) info = info.strip(' ').strip('\t') else: theid = header altid = "" # remove newlines so the sequence data is contiguous seq = re.sub("\n", "", seq) # remove terminal whitespace seq = seq.strip(' ') if idPat != '': crap, theid, crap = re.split(idPat, theid) #print 'theid', theid for pre,suf,pat in addins: if re.match(pat, theid): # print 'theid', theid, 'pat', pat theid = pre+theid+suf # print 'new', theid if TOUPPER: seq = seq.upper() # build the entry # add a little check to see if there are repeat entries if fadict.has_key(theid): print theid, ": This key has already been used to store another fasta entry!" else: # build the entry for this id # allow choice of primary dict key if theid and altid and usealtid: #print "OK", theid, altid fadict[altid] = {'id':theid, 'altid':altid, 'info': info, 'seq': seq} else: fadict[theid] = {'id':theid, 'altid':altid, 'info': info, 'seq': seq} return fadict def list2fasta(seqs=[], ids=[]): dct = dict() uid = 0 for i in range(len(seqs)): theid = str(uid) try: theid = ids[i] except IndexError: pass dct[theid] = {'id':theid, 'seq':seqs[i], 'info':''} uid += 1 return dct def printFasta(fadict, file='', pipe=sys.stdout, key='seq', usealtid=False): """Print a dictionary of fasta entries to a file in mfasta format""" keys = fadict.keys() keys.sort() if file: pipe = open(file,'w') for theid in keys: if 'altid' in fadict[theid] and not usealtid: header = '>'+theid+' '+fadict[theid]['altid']+' '+fadict[theid]['info'] elif 'id' in fadict[theid]: header = '>'+theid+' '+fadict[theid]['id']+' '+fadict[theid]['info'] else: header = '>'+theid+' '+fadict[theid]['info'] print >> pipe, header print >> pipe, fadict[theid][key] print >> pipe if file: pipe.close() def fasta2Phylip(dct={}, file='phylipi.txt', infile='', verbose=False): """Convert a fasta file into Phylip interleaved format.""" # print infile d = dct if infile: d = readFasta(infile) names = d.keys() # fix length maxlen = 10+1 shorts = ['' for i in range(len(names))] for i in range(len(names)): shorts[i] = fixspacing(names[i], maxlen) blank = fixspacing('',maxlen) taxa = len(names) # print d # print 'test', d[d.keys()[0]] alilen = len(d[d.keys()[0]]['seq']) seqs = ['' for i in range(taxa)] if verbose: print shorts if verbose: print taxa, alilen if verbose: print map(len, seqs) enc = 'ascii' for i in range(taxa): seqs[i] = d[names[i]]['seq'].encode(enc) # print out an interleaved phylip formatted file pos = 0 fh = open(file, 'w') # alilen = 715 fh.write(str(taxa)+' '+str(alilen)+'\n'.encode(enc)) for i in range(taxa): 
fh.write(shorts[i]+seqs[i][pos:pos+maxlen]+' '+seqs[i][pos+maxlen:pos+2*maxlen]+' '+seqs[i][pos+2*maxlen:pos+3*maxlen]+' '+seqs[i][pos+3*maxlen:pos+4*maxlen]+'\n'.encode(enc)) pos = 2*maxlen fh.write('\n'.encode(enc)) while pos < alilen: for i in range(taxa): fh.write(blank+seqs[i][pos:pos+maxlen]+' '+seqs[i][pos+maxlen:pos+2*maxlen]+' '+seqs[i][pos+2*maxlen:pos+3*maxlen]+' '+seqs[i][pos+3*maxlen:pos+4*maxlen]+'\n'.encode(enc)) pos += 4*maxlen fh.write('\n'.encode(enc)) fh.close() return 1 def argmax(v): """Return argmax of a list""" mi = 0 mv = v[0] for i in range(1,len(v)): if float(v[i]) > float(mv): mv = float(v[i]) mi = i return mi def slash(astr): if not len(astr): return '/' elif astr[-1] !='/': astr+='/' elif astr[len(astr)-2:len(astr)] == '//': astr = astr[:-1] return astr def makeContigs(intervals): intervals.sort() # print intervals[0:100] # combine overlaps coverage = 0 # total bases mapped contigs = [] curr = 1 while curr < len(intervals): bak = curr start = intervals[curr-1][0]; stop = intervals[curr-1][1] while curr < len(intervals) and intervals[curr][0] <= stop: if intervals[curr][1] > stop: stop = intervals[curr][1] curr += 1 contigs += [(start,stop)] coverage += abs(start-stop) curr += 1 # print 'contigs', contigs return {'contigs':contigs, 'coverage':coverage} eq = lambda s: lambda i: str(s)!=str(i) filterstr = lambda s: lambda v: filter(eq(s), v) roundna = lambda e: lambda x: ((x==nastr or e==nastr) and nastr) or round(float(x),int(e)) floatna = lambda x: x == nastr and nastr or float(x) intna = lambda x: (x == nastr or x == '' or x == 'NA' and nastr) or int(x) def factit(x): """Iterative implementation of factorial function.""" if x <= 1: return 1 a = [0 for i in range(x+1)] a[0] = 1; a[1] = 1 for i in range(2,x+1): a[i] = i*a[i-1] return a[x] nchoosek = lambda n,k: factit(n)/(factit(k)*factit(n-k)) # binomial coefficient prod = lambda lst: reduce(mul, lst) # product function # basic statistics def avg(lst): try: return reduce(add, lst)/float(len(lst)) except TypeError: return nastr def harmonicMean(v, minval=1e-20): if len(v) == 0: return 0 # prevent 0 division v = map(lambda x: x==0 and minval or x, v) return len(v) / sum(map(lambda x: 1./x, v)) ssd = lambda lst, mu: reduce(add, map(lambda x: (x-mu)*(x-mu), lst)) def variance(lst): try: return divna(ssd(lst,avg(lst)), float((len(lst)-1))) except TypeError: return nastr def sd(lst): try: return math.sqrt(variance(lst)) except TypeError: return nastr def percentile(v=[], k=0.5): """ Return the value of array that corresponds to the kth percentile of the data. 
""" if len(v) <= 0: return 0.0 temp = map(float, v) temp.sort() n = len(temp) idx = float(k)*(n+1) lidx = math.floor(idx) hidx = math.ceil(idx) if idx < 1.0: return float(temp[0]) elif idx > n-1: return float(temp[n-1]) elif not (idx == lidx or idx == hidx): return float(temp[int(lidx)] + temp[int(hidx)])/2.0 else: return float(temp[int(idx)]) median = lambda v: percentile(v,.5) getQuants = lambda dat: ','.join([str(q)+':'+str(round(percentile(dat, q),4)) for q in [0.05, 0.25, 0.5, 0.75, 0.95]]) # list operations head = lambda x: x[0] tail = lambda x: x[len(x)-1] car = lambda x: x[0] cdr = lambda x: x[len(x)-1] # arithmetic operations with missing values subna = lambda m, v: ((m==nastr or v==nastr or float(v)==0.0) and nastr) or float(m) - float(v) multna = lambda m, v: ((m==nastr or v==nastr or float(v)==0.0) and nastr) or float(m) * float(v) divna = lambda m, v: ((m==nastr or v==nastr or float(v)==0.0) and nastr) or float(m) / float(v) roundna = lambda e: lambda x: ((x==nastr or e==nastr) and nastr) or round(float(x),int(e)) def makeRange(xmin,xmax,interval=None,n=None): if n == None: n = int(abs(xmax-xmin)/float(interval)) else: interval = (xmax-xmin)/float(n) interval += interval/float(n-1) return [xmin+interval*i for i in range(n)] def histogram(dat, nbins=100, bins=[], categorical=False, yfreq=False, logscale=False, logbase=10, shift=0, dtype=None): import math if logscale: dat = map(lambda x: math.log(x,logbase), dat) if len(bins) > 0: # print 'problem?' nbins = len(bins) givenbins = True else: givenbins = False # need to create a histogram filt = filterstr(nastr)(dat) if dtype: filt = map(dtype, filterstr(nastr)(dat)) dat = None if not len(filt): return [[0,0]] counts = [0 for i in range(nbins)] if not givenbins and not categorical: bins = makeRange(min(filt), max(filt), n=nbins) elif not givenbins: bins = unique(filt) # minor bin correction for plotting if shift != 0: bins = map(lambda x: x+shift, bins) # this is like O(N2) too slow if not categorical: for j in range(len(filt)): k = 0 # print j, len(filt) while k < nbins and float(filt[j]) > bins[k]: k +=1 if k == nbins: k -= 1 counts[k] += 1 else: bindct = {} for b in bins: bindct[b] = 0 bk = bindct.keys() for j in range(len(filt)): bindct[filt[j]] += 1 # convert back to a histogram bins = []; counts = [] for key in bindct.keys(): bins += [key]; counts += [bindct[key]] tmp = [] intnbins = int(nbins) # counts or frequency? 
if yfreq: tot = float(sum(counts)) if logscale: tmp = [[logbase**bins[k],counts[k]/tot] for k in range(intnbins)] else: tmp = [[bins[k],counts[k]/tot] for k in range(intnbins)] if logscale: tmp = [[logbase**bins[k],counts[k]] for k in range(intnbins)] else: tmp = zip(bins,counts) if categorical: tmp = sorted(tmp) return tmp # dictionary keyed by ascii character returning phred q-score # ascii 33 to 126 - full range fastqchars = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~'] A2Q = dict(zip(fastqchars, range(len(fastqchars)))) # string conversion to integer (potentially a long) encodeNt = lambda x: x=='A'and'1' or x=='T'and'2' or x=='C'and'3' or x=='G'and'4' or x=='N'and'5' encodeSeq = lambda seq: int(''.join(map(encodeNt, seq))) decodeNt = lambda x: x=='1'and'A' or x=='2'and'T' or x=='3'and'C' or x=='4'and'G' or x=='5'and'N' decodeSeq = lambda val: ''.join(map(decodeNt, str(val))) encodeQ = dict(zip(fastqchars, map(str,range(10,len(fastqchars)+10)))) # every character is 2 integer digits decodeQ = dict(zip(map(str,range(10,len(fastqchars)+10)), fastqchars)) encodeQual = lambda qual: int(''.join(map(lambda x: encodeQ[x], qual))) pairUP = lambda qual: map(lambda x: qual[x-1]+qual[x] ,range(1,len(qual),2)) decodeQual = lambda qual: ''.join(map( lambda x: decodeQ[x], pairUP(str(qual)) )) # the core procedure def sniper(readfiles, GR, outfile='', saveall='', tmpdir='', expPhred=30, maxreadlen=150, Lambda=.5, prior='resequence', theta=0.001, verbose=False, mindepth=1, bounds={}, ploidy=2, LIPieces=25): """Bayesian genome SNP calling utilizing multiply aligned reads bounded by k mismatches. """ # count total genomic positions flg = sum(map(lambda x: len(x['seq']), GR.values())) # store length of each chromosome/scaffold GRlengths = dict( map(lambda x: (x, len(GR[x]['seq'])), GR.iterkeys()) ) # Internal methods # --------------------------------------------------------------- ln = lambda x: math.log(x, math.e) log = lambda x: math.log(x, 10) lowestQ = int(-10*log(1e-16)) # convert between phred score and p-value Q2P = lambda e: (e == 0 and 1-1e-16) or 10**(-e/10.)
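# Worked example of the conversions used here (illustrative values only):
#   Q2P(30) -> 10**-3 = 0.001 (probability that a base call is wrong)
#   Q2P(0)  -> 1-1e-16 (capped just under 1)
# and of the integer compression defined above for storing read pileups:
#   encodeSeq('ATCGN') -> 12345, decodeSeq(12345) -> 'ATCGN'
#   encodeQual('!#')   -> 1012,  decodeQual(1012) -> '!#'  (each quality character is stored as two digits)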
# avoid 1 P2Q = lambda p: -10*log(p) A2P = dict( [ascii, 1-Q2P(A2Q[ascii])] for ascii in A2Q.keys() ) # Probability of no error invA2P = dict( [ascii, Q2P(A2Q[ascii])] for ascii in A2Q.keys() ) # Probability of error # Computes mismatches between 2 sequences derror = lambda a,b: (a == b and 0) or sum(map(lambda x,y: x != y and 1 or 0, a,b)) def intervalWithinRange(baseint, queryint): """Returns 0,1,2,3 according to these rules: 0: queryint is not within base int 1: queryint is within base int 2: queryint spans the beginning of baseint 3: queryint spans the end of baseint baseint/queryint need not already be sorted ie, [1000, 2000] and [2000,1000] will return same value """ # assumes sorted coordinates - slightly faster # baseint = sorted(baseint); queryint = sorted(queryint) # if the end of the element comes before the start of the base if (queryint[1] < baseint[0] or baseint[1] < queryint[0]): return 0 # if element lies inside the base region elif (queryint[0] >= baseint[0] and queryint[1] <= baseint[1]): return 1 # if we span a region elif (queryint[0] < baseint[0] and queryint[1] >= baseint[0]): return 2 else: return 3 def overlaps(tt,q): return sum([intervalWithinRange(t,q) for t in tt]) def loadUniqueReads(tmpfile, fromChromosomes, filterids={}, bounds={}): # initialize D with chrom names D = dict( map(lambda x: (x,{}), fromChromosomes) ) Bb = {} # set up dictionary based on bounds for k in bounds.keys(): for low,high,flag in sorted(bounds[k]): if flag == '+': try: Bb[k] += [(low,high)] except KeyError: Bb[k] = [(low,high)] if not os.access(tmpfile, os.F_OK): return D fh = open(tmpfile) count = 1 for line in fh: if count % 1000000 == 0: sys.stdout.write('%s,'%(count)); sys.stdout.flush() count+=1 try: lst = line[:-1].split('\t') chrom = lst[2] pos = int(lst[3])-1 if chrom == '*' or (chrom not in D) or lst[0] in filterids: continue elif chrom in Bb and overlaps(Bb[chrom], (pos,pos+len(lst[10]))) == 0: continue # collapse reads by position, so that each list index contains all reads which start at this position # = (pos, seq, qual, hitid) # stuff = (pos, lst[9], lst[10], lst[0]) # stuff = (pos, lst[9], lst[10], hash_float(lst[0])) # stuff = (pos, lst[9], lst[10], hash(lst[0]), len(lst[9])) # NEW! added len(seq) # compress seq and qual here too stuff = (pos, encodeSeq(lst[9]), encodeQual(lst[10]), hash(lst[0]), len(lst[9])) # NEW! 
added len(seq) if pos in D[chrom]: D[chrom][pos] += [stuff] else: D[chrom][pos] = [stuff] # exception if a line beginning with @ except IndexError: pass except ValueError: pass fh.close() return D # map(lambda pos: pos in D and D[pos] or [], range(totallen)) def loadDegReads(readfiles, chromosomesOfInterest, bounds={}): """Load the degenerate portion of the read map for any reads aligning to this chromosome.""" # convert to dict chint = dict( map(lambda x: (x,None), chromosomesOfInterest) ) Bb = {} # set up dictionary based on bounds for k in bounds.keys(): for low,high,flag in sorted(bounds[k]): if flag == '+': try: Bb[k] += [(low,high)] except KeyError: Bb[k] = [(low,high)] # gather reads of interest wrt this chrom group readsOfInterest = {} pipeit(' - Scanning for degenerate reads of interest...') for pese in ['PE', 'SE']: degfile = readfiles[pese+'deg'] if degfile: fh = open(degfile) count = 1 for line in fh: if count % 1000000 == 0: pipeit('%s,'%(count)) count+=1 if line[0] == '@': continue lst = line[:-1].split('\t') chrom = lst[2] pos = int(lst[3])-1 # only keep a read if it has an alignment on these chromosomes/bounds if chrom in chint: readsOfInterest[lst[0]] = None elif chrom in Bb and overlaps(Bb[chrom], (pos,pos+len(lst[10]))) > 0: readsOfInterest[lst[0]] = None fh.close() pipeit('Done',1) # load all degenerate reads # ------------------------- RdxChr = {'PE':{}, 'SE':{}} # chrKeep = {} for chrom in GR.keys(): # needs all chrom keys # chrKeep[chrom] = None RdxChr['PE'][chrom] = {} RdxChr['SE'][chrom] = {} otherhits = {} OHI = {} # we actually need 2 passes through this file # first, fetch all reads aligned to the chromosomes of interest # then collect all reads with that name for pese in ['PE', 'SE']: degfile = readfiles[pese+'deg'] if degfile: pipeit(' - Loading degenerate %s reads...'%(pese)) fh = open(degfile) count = 1 for line in fh: if count % 1000000 == 0: pipeit('%s,'%(count)) count+=1 try: lst = line[:-1].split('\t') chrom = lst[2] if chrom == '*' or lst[0] not in readsOfInterest: continue hitid = hash(lst[0]); pos=int(lst[3])-1; seq=lst[9]; qual=lst[10] # only add this read if we haven't already recorded it # our read maps duplicate both ends of a PE read even if # only one of the mate pairs has multiple alignments # so we need to eliminate these duplicate entries # making OHI a necessity # the complete information we need for SNP calling stuff = (pos, encodeSeq(seq), encodeQual(qual), hitid, len(seq)) # if initial map file has large d, user can reduce memory usage as desired by capping # the number of alignments loaded into memory here loh = 0 try: loh = len(otherhits[hitid]) except KeyError: pass # make sure this isn't a duplicate entry in the map file # by checking location of this alignment (see comment above) if hitid in OHI and (chrom,pos) not in OHI[hitid] and loh < maxnumalign: otherhits[hitid] += [(stuff,chrom)] OHI[hitid][(chrom,pos)] = None # and add the info to the non-unique read pileup if pos in RdxChr[pese][chrom]: RdxChr[pese][chrom][pos] += [stuff] else: RdxChr[pese][chrom][pos] = [stuff] elif hitid not in OHI and loh < maxnumalign: otherhits[hitid] = [(stuff,chrom)] OHI[hitid] = {(chrom,pos):None} # and add the info to the non-unique read pileup if pos in RdxChr[pese][chrom]: RdxChr[pese][chrom][pos] += [stuff] else: RdxChr[pese][chrom][pos] = [stuff] # exceptions if line begins with @ except IndexError: continue except ValueError: continue fh.close() pipeit('Done.',1) del OHI # no longer needed return RdxChr, otherhits # 
========================================================================= # Initialize local variables # ========================================================================= if prior == 'haploid': ploidy = 1 if ploidy == 1: prior = 'haploid' nucleotides = ['A', 'T', 'C', 'G'] # 10 possible genotypes that we can identify genotypes = None genoUni = None if ploidy == 2: genotypes = [('A','A'),('A','T'),('A','C'),('A','G'),('T','T'),('T','C'),('T','G'),('C','C'),('C','G'),('G','G')] genoUni = {('A','A'):'AA',('A','T'):'AT',('A','C'):'AC',('A','G'):'AG', ('T','T'):'TT',('T','C'):'CT',('T','G'):'GT',('C','C'):'CC', ('C','G'):'CG',('G','G'):'GG'} else: genotypes = ['A', 'T', 'C', 'G'] genoUni = {'A':'A', 'T':'T', 'C':'C', 'G':'G'} rangelengenotypes = range(len(genotypes)) GT = genotypes # alias invGT = None if ploidy == 2: invGT = dict( map(lambda i: ((GT[i][0],GT[i][1]),i) , rangelengenotypes) ) invGT2 = dict( map(lambda i: ((GT[i][1],GT[i][0]),i) , rangelengenotypes) ) invGT.update(invGT2) # ========================================================================= # Parameters # ========================================================================= # P(G_xy(l)) = pr. drawing genome at position l with genotype xy # -------------------------------------------------------------- # p = log(1/10.) # PG = [p for i in range(lengenotypes)] # uniform prior # 0 = homozygous reference, 1 = het, 2 = homozygous derived if prior == 'resequence': # heterozygous different is rarest # h0, t1, h2, t2, where P(h2) = P(t1) and P(t2) == P(t1)^2 PGvals = [1-theta-theta**2, theta/2., theta/2., theta**2] elif prior == 'resequence-het': # h0, t1, h2, t2, where P(h2) = P(t2) PGvals = [1-theta-theta**2, theta, (theta**2)/2., (theta**2)/2.] elif prior == 'maq': # Durbin/maq model: Pr{XX} = Pr{YY} > Pr{XY} # h0, t1, h2, t2 PGvals = [(1-theta-theta**2)/2., theta, (1-theta-theta**2)/2., theta**2] elif prior == 'uniform': PGvals = [.25,.25,.25,.25] elif prior == 'haploid': # h0, t1, h2, t2, where P(t1) and P(t2) = 0 PGvals = [1-theta, theta] else: sys.exit('snpper: Do not understand prior argument: %s'%(prior)) PGlamb = None def PGlamb(i,ref): if genotypes[i][0]==ref and genotypes[i][1]==ref: return PGvals[0] elif (genotypes[i][0]==ref and genotypes[i][1]!=ref) or (genotypes[i][0]!=ref and genotypes[i][1]==ref): return PGvals[1] elif genotypes[i][0]==genotypes[i][1] and genotypes[i][0]!=ref: return PGvals[2] elif genotypes[i][0]!=genotypes[i][1] and genotypes[i][0]!=ref and genotypes[i][1]!=ref: return PGvals[3] if ploidy == 1: # maps a genotype given reference to 0,1 = same as ref, or different def PGlamb(i,ref): if genotypes[i]==ref: return PGvals[0] else: return PGvals[1] # make into a lookup dictionary PG = {} for i in rangelengenotypes: for nt in nucleotides: PG[i,nt] = PGlamb(i,nt) thetastr = ', Theta = %s'%(theta) if prior == 'uniform': thetastr = '' print ' - Using global prior probabilities: Prior model = %s%s'%(prior, thetastr) # ========================================================================= # Precompute nt probability scores # ========================================================================= # PriS = pr. 
sequencing read ri from template sequence S # -------------------------------------------------------------- maxphred = 40 # generalize this for all read lengths - 0 to 200 binCoef = [[nchoosek(rl,d) for d in range(rl+1)] for rl in xrange(maxreadlen+1)] # call: binomialP(readlength)(mismatches, Perror) # tune between read/position-specific error vs a global/position specific error binomialP = lambda rl: lambda d,p: binCoef[rl][d] * p**d * (1-p)**(rl-d) expError = 10**(-expPhred/10.) PriSbin = [[Lambda * binomialP(rl)(d, expError) for d in range(rl+1)] for rl in xrange(maxreadlen+1)] # index by [rl][offset] offsetRange = [map(lambda offset: range(offset)+range(offset+1, rl), xrange(maxreadlen+1)) for rl in xrange(maxreadlen+1)] ntPriS = None opLambda = 1. - Lambda # binomial + quality model if Lambda < 1.0: print ' - Using tuned binomial+quality likelihood model with lambda=%s, global error=%s'%(Lambda, expPhred) def ntPriS(S, R, Q, offset, rl): # localize variables for speed oR = offsetRange Pb = PriSbin lA2P = A2P linvA2P = invA2P nucs = nucleotides d = 0 # number of mismatches in the alignment p = 1 # read/position specific likelihood # avoid the offset position for i in oR[rl][offset]: p *= (S[i] == R[i] and lA2P[Q[i]]) or linvA2P[Q[i]] d += int(S[i] != R[i]) # complete with offset position return dict( map(lambda nt: (nt, Pb[rl][d+int(S[offset]!=nt)] + opLambda*p*(S[offset]==nt and lA2P[Q[offset]] or linvA2P[Q[offset]])), nucs) ) else: print ' - Using binomial only likelihood model with global error=%s'%(expPhred) # binomial only model when lambda = 1 # this will be faster def ntPriS(S, R, Q, offset, rl): # localize variables for speed oR = offsetRange Pb = PriSbin nucs = nucleotides d = 0 # number of mismatches in the alignment # avoid the offset position # for i in range(offset)+range(offset+1,rl): for i in oR[rl][offset]: d += int(S[i] != R[i]) # complete with offset position return dict( map(lambda nt: (nt, Pb[rl][d+int(S[offset]!=nt)]), nucs) ) # DONE CHOOSING METHOD # PriSj = pr. that Sj was the template out of all SGri templates # -------------------------------------------------------------- maximumcoverage = 100001 # this will crash if any locus has greater read depth PriSj = [nastr]+map(lambda n: 1/float(n), range(1,maximumcoverage)) # uniform, precomputed # indicator variable that returns 1 if genotype is homozygous and 2 if heterozygous # used because we are only considering the 10 unique genotypes I2xHet = {} if ploidy == 1: for X in genotypes: I2xHet[X] = 1. else: for X,Y in genotypes: if X==Y: I2xHet[(X,Y)] = 1. else: I2xHet[(X,Y)] = 2. # control plot for phred to Pvalue transformation # lst = [1-1/10**(x/10.) for x in range(34) # plot.scatter(zip(range(34),lst), style='linespoints', xlabel='phred', ylabel='1-error rate') # Set up bounds for subset of genome to process # ========================================================================= GRNA = {} if len(bounds.keys()): flg = 0 print ' - We have genotyping bounds...' 
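# Sketch of the bounds structure consumed below (hypothetical chromosome name and coordinates):
#   bounds = {'chrom01': [(0, 1000, '+'), (5000, 'NA', '+')]}
# i.e. each chromosome maps to (low, high, flag) triples; flag '+' keeps a region (or a single
# position when high is NA), while flag '-' masks the region with Ns so it is skipped during genotyping.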
null = [nastr, 'NA'] for tmpk in GR.keys(): master = GR[tmpk]['seq'][:] SEEN = False for k in [tmpk, tmpk.lower(), tmpk.upper()]: if k in bounds and not SEEN: SEEN = True lastpos = 0 # may be many positions of interest for low,high,flag in sorted(bounds[k]): if flag == '+': # remove positions not specified in a range if low not in null and high not in null: # we have a range tmps = master tmp = ''.join(map(lambda x: 'N', tmps[0:low])) tmp += tmps[low:high+1] tmp += ''.join(map(lambda x: 'N', tmps[high+1:len(tmps)])) master = tmp elif low not in null: # we have a single position of interest # since we've sorted these positions, # erase between lastpos and low tmp = master[0:lastpos+1] tmp += ''.join(map(lambda x: 'N', master[lastpos+1:low])) tmp += master[low] tmp += master[low+1:len(master)] master = tmp lastpos = low # else just leave the entire chromosome intact elif flag == '-': # set ranges to NA if low not in null and high not in null: tmps = master tmp = tmps[0:low] tmp += ''.join(map(lambda x: 'N', tmps[low:high+1])) tmp += tmps[high+1:len(tmps)] master = tmp # now I can finish erasing the end if lastpos: master = master[0:lastpos] + ''.join(map(lambda x: 'N', master[lastpos:len(master)])) # write back GRNA[tmpk] = {'id':tmpk, 'seq':master} flg += len(master) else: GRNA = GR # ========================================================================= # Initialize variables for likelihood computation # ========================================================================= totalLoci = 0 # count number of reference positions passed numHypotheses = 0 # count up number of hypotheses we make outfh = None if outfile: outfh = open(outfile, 'w') alloutfh = None if saveall: alloutfh = open(saveall, 'w') # blank template list for likelihood calculations below (copy from this) LLg0 = [0 for i in genotypes] # if doing likelihood sums # now proceed to genotype each reference genomic position chromosomesOfInterest = sorted(GRNA.keys()) nchroms = len(chromosomesOfInterest) # split chromosomes into LIPieces loading groups chromgroups = [] copydir = chromosomesOfInterest[:] if LIPieces > nchroms: LIPieces = nchroms itemspergroup = math.ceil(nchroms / float(LIPieces)) while len(copydir): currgrp = [] while len(currgrp) < itemspergroup and len(copydir): currgrp += [copydir.pop(0)] chromgroups += [currgrp] globRu = {'PE':{}, 'SE':{}} RdxChr = {}; otherhits = {} lipct = 0 for chromosomesOfInterest in chromgroups: lipct += 1 # load the relevant reads for this chromosome group # ------------------------------------------------- PESEs = ['SE', 'PE'] if not USESINGLETONS: PESEs = ['PE'] # unique reads for pese in PESEs: pipeit(' - Loading unique %s reads | loading group %s/%s...'%(pese, lipct, LIPieces)) globRu[pese] = loadUniqueReads(readfiles[pese+'uni'], chromosomesOfInterest, bounds=bounds) pipeit('Done.',1) # degenerate reads RdxChr, otherhits = loadDegReads(readfiles, chromosomesOfInterest, bounds=bounds) # process the reads for chrom in chromosomesOfInterest: GRseq = GR[chrom]['seq'] # want actual sequence for proper derror calculations chromlen = len(GRseq) # clear read pileup and marked list every new chromosome SGr.clear() # stores read pileup - this is a global variable prevGi = -maxreadlen # remember the last position Gi we visited # ITERATE OVER REFERENCE GENOMIC POSITIONS # ---------------------------------------- for Gi in xrange(chromlen): totalLoci += 1 # is this position masked due to user-specified bounds? 
if GRNA[chrom]['seq'][Gi] == 'N': continue # can't resequence without a reference ref = GRseq[Gi] # actual nucleotide in reference genome at position Gi if Gi % 10000 == 0: pipeit(' %s %s/%s = %s (Hypos=%s | %.2f'%(chrom, Gi,chromlen, ref, numHypotheses, 100*(totalLoci/float(flg)))+'% complete)',1) # GET READ PILEUP FOR THIS POSITION # --------------------------------------------------------------- # all I really need to do here is delete entries that no longer overlap and add new ones # add entries from new positions that do overlap for pos in xrange(max(0, (Gi-maxreadlen)+1, prevGi+1), min(Gi+1, chromlen)): # having group-indexed the reads by starting position we can do this # each key of SGr is the read id, returning a list of pos,seq,phred triples # FETCH UNIQUE READS OVERLAPPING THIS POSITION for pese in PESEs: if pos not in globRu[pese][chrom]: continue # iterate over reads starting at this position for (epos, eseq, equal, ehitid, esrl) in globRu[pese][chrom][pos]: SGr[ehitid] = [(epos, decodeSeq(eseq), decodeQual(equal), esrl)] # FETCH DEGENERATE READS OVERLAPPING THIS POSITION for pese in PESEs: if pos not in RdxChr[pese][chrom]: continue # iterate over reads starting at this position for (epos, eseq, equal, ehitid, esrl) in RdxChr[pese][chrom][pos]: SGr[ehitid] = [(epos, decodeSeq(eseq), decodeQual(equal), esrl)] # add conditional probs for other alignments for (opos, oseq, oqual, ohitid, osrl),ochrom in otherhits[ehitid]: SGr[ehitid] += [(opos, ochrom, decodeSeq(oseq), decodeQual(oqual))] # update previous position we visited prevGi = Gi # --------------------------------------------------------------- # DONE OBTAINING PILEUP FOR THIS POSITION # ASSESS THE GENOTYPE AT THIS POSITION # ------------------------------------ # Compute product likelihood of read pileup across genotypes LL = LLg0[:] # read-sum log likelihoods AF = {'A':0, 'T':0, 'C':0, 'G':0, 'N':0} # allele frequency distribution multiplicity = 0 # number of reads that are degenerate (have >1 alignment; some k < n) totalmultiplicity = [] # total number of degenerate alignments (d-n) # mmpr = [] # number of alignment mismatches per read marked = [] # list of readids marked to be deleted for readid,alignments in SGr.iteritems(): # if we don't overlap, then discard and continue # first alignment overlaps current position on chrom (pos,seq,qual,curRL) = alignments[0] offset = Gi-pos # what is nt of interest wrt this read? # curRL = maxreadlen # fixed length reads # curRL = len(seq) # this read's read length # then this read doesn't actually overlap current locus if offset >= curRL: marked += [readid]; continue # need to know the size of SG(l), the k-bounded string set # for the resequenced genome SG at position l # that is, total number of degenerate alignments + alignment at this position numTemplates = len(alignments) # = d+1, where d is multiplicity for this read AF[seq[offset]] += 1 # update allele counts # now have d alignments, which are all degenerate multiplicity += (numTemplates>1 and 1) or 0 # are there any degenerate alignments? totalmultiplicity += [numTemplates-1] # add d = len(alignments) # likelihood = (1/d+1)P(ri|Gl=x) + (1/d+1)Pdeg1 + ... + (1/d+1)Pdegd # = (1/d+1)[P(ri|Gl=x) + Pdeg1 + ... 
+ Pdegd] # = (1/d+1)[P(ri|Gl=x) + sum_d(Pdeg)] # ~ (1/d+1)P(ri|Gl=x) + (1/d+1)d x Pdeg # so we are just computing a weighted average # now the 1/d+1 term is the probability of one genomic segment # serving as the template for sequencing: P(s | G) # here this is uniformly distributed across reads # below is an implementation where P(s | G) ~ P(s | ri, G) # where we solve in a Bayesian fashion: # P(s | ri, G) = P(ri | s, G)P(s,G) / Sum P(ri | s, G)P(s,G) # with uniform prior this reduces to # P(s | ri, G) = P(ri | s, G) / Sum P(ri | s, G) # which is a weighting based on relative strength of the read aligning to # that genomic region # basically this says that for degenerate reads, give the most weight # to the location with fewer mismatches or higher quality # if ploidy = 2, then really have 2d + 2 alignments for this read Pcond = ploidy == 1 and PriSj[numTemplates] or PriSj[2*numTemplates] # Note, we could formulate Pcond in a more read-specific manner... # compute the likelihood for this read given the offset position Prl = ntPriS(seq,GRseq[pos:pos+curRL],qual,offset,curRL) # sum up the degenerate probability mass: info = (opos, ochrom, seq, oqual) # consider variant reference genotypes at degenerate positions - same offset as reference # Pdeg = sum( map(lambda info: max(ntPriS(info[2],GR[info[1]]['seq'][info[0]:info[0]+len(info[2])],info[3],offset).values()), alignments[1:]) )*Pcond Pdeg = sum( map(lambda info: max(ntPriS(info[2],GR[info[1]]['seq'][info[0]:info[0]+curRL],info[3],offset,curRL).values()), alignments[1:]) )*Pcond twoPdeg = 2*Pdeg # saves on computation time LL = ploidy == 1 and map(lambda g: LL[g] + log(Pcond*Prl[GT[g]] + Pdeg), rangelengenotypes) or \ map(lambda g: LL[g] + log(Pcond*(Prl[GT[g][0]] + Prl[GT[g][1]]) + twoPdeg), rangelengenotypes) # done loop over reads map(delete,marked) # delete marked reads # what is final read depth at position Gi in resequenced genome? depth = len(SGr) # don't print out info if position has no depth if depth < mindepth: continue # need to scale down the likelihoods so that natural scale values don't become 0 # we are currently on log scale so values are large negative exponents if sum(LL) < 1e-250: # that is some really small value # to avoid numerical underflow due to exponentiation of large negative LL values # need to find a positive constant C such that math.e^LL+C is computable, eg < 1000 # precursor step is to do blind rescaling to bring values near computable range # if you don't do this, you will inevitably set many positions values to equality # whereas they could be quantified just fine maxll = max(LL) while maxll < -200: c = -maxll/4. 
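# Illustrative trace of this rescaling (hypothetical likelihoods): if max(LL) is -800 the shift c
# is +200, moving max(LL) to -600; repeated passes give -450, -337.5, -253.1, then -189.8, at which
# point max(LL) > -200 and the values are safe to exponentiate after the final offset below.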
# this can't become positive LL = map(lambda x: x+c, LL) # print >> sys.stderr, 'Step:', lik maxll = max(LL) badvalue = -100000000000000000000000000000000 # token indicator of too small value LL = map(lambda x: (x < -250 and badvalue) or x, LL) # confirm that not all have badvalue if len(filter(lambda x: x!=badvalue,LL)) == 0: print >> sys.stderr, 'Warning: All likelihoods are 0 at position', Gi # then all likelihoods have the same value LL = map(lambda x: -250, LL) else: idx = argmax(LL) # get the largest likelihood candC = -LL[idx] - 100 # makes the max possible log likelihood = -100: 10**-100 = 1e-100 # all other liklihoods will be smaller than this LL = map(lambda x: (x == badvalue and -250) or (x+candC), LL) # DONE CORRECTION # Return likelihoods to natural scale LL = map(lambda x: 10**x, LL) # Allele frequency # ---------------- AFstr = ','.join( map(lambda nt: nt+':'+str(round(AF[nt]/float(depth),2)), nucleotides) ) # Posterior probabilities # ---------------------------------------------- # we now have a length-10 vector of likelihood values # for all reads at Gi given each genotype # calculate the posterior probability of each possible genotype # given the read set # we must calculate the PGxyR for all possible genomic string configurations C(SGxy(l)) # assuming genotype at position l is xy # so sum up prob over the C(SGxy(l)) for each genotype xy # so basic assumption is G = GR so that C(SGxy(l)) is simply the k-bounded # set of the reference: C(SGxy(l)) = C(SGRxy(l)) # this way there is no summing involved # Multiplicity ratio: d-n / n (or Ndeg / N), ranging from 0 to inf # aka the average number of additional alignments per read / locus # Sum(d_read) / Sum(reads) per locus # -------------------------------------------------- udalpha = round(harmonicMean(totalmultiplicity), 3) PGxyR = None try: # numerator: P(r1...rn | Gxy(Gi)) x P(Gxy(Gi)) if ploidy == 1: PGxyR = map(lambda g: LL[g]*PG[g,ref], rangelengenotypes) else: dnt = {'A':None,'T':None,'C':None,'G':None}; del dnt[ref] dL = max(map(lambda alt: LL[invGT[(ref,alt)]], dnt.keys())) homweight = max(1e-10, 1.-(udalpha+1)*theta) # so minimum P(RR) = e-10 hetweight = udalpha*theta PGxyR = map(lambda g: GT[g][0]==ref and GT[g][0]==GT[g][1] and \ LL[g]*homweight + dL*hetweight or \ LL[g]*PG[g,ref], rangelengenotypes) # denominator: Sum_xy{ P(r1...rn | Gxy(Gi)) x P(Gxy(Gi)) } Ptotal = float(sum(PGxyR)) except OverflowError: print 'snpper(overflow error):\nLL=',LL, '\nprior=', [PG[i,ref] for i in rangelengenotypes] continue # Final posterior probabilities for all genotypes at position Gi # -------------------------------------------------------------- # double up the heterozygous probabilities, e.g. P[] PGxyR = map(lambda i: I2xHet[genotypes[i]]*PGxyR[i], rangelengenotypes) Ptotal = float(sum(PGxyR)) PP = map(lambda x: x/Ptotal, PGxyR) # Determine MAP genotype and posterior # ------------------------------------ # take argmax, but multiple genotypes may have similar or identical PP. # This would happen if the pileup is highly degenerate. 
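# For example (hypothetical posteriors): PP = [0.95, 0.04, 0.01, ...] gives MAP = 0.95 and
# Q = int(-10*log10(1 - 0.95)) = 13, which is the nominal significance cutoff applied below;
# MAP = 0.99 would give Q = 20.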
idx = argmax(PP) geno = genoUni[genotypes[idx]] MAP = PP[idx]>1 and 1 or PP[idx] # force to 1 Q = nastr try: Q = int(-10*log(1-MAP)) # convert to phred score # this occurs if MAP = 1, so force to some arbitrary minimum pvalue except ValueError: Q = lowestQ # this will be Q = 160 # occurs if MAP is < about 1e-16 except OverflowError: Q = lowestQ # this will be Q = 160 # Record statistics info = [chrom, str(Gi), ref, geno, str(Q), str(depth), str(multiplicity), str(sum(totalmultiplicity)), '%.3f'%(udalpha), 'N', AFstr] # fixed haploid/diploid issue genoref = (ploidy > 1 and iupac2nts(ref)) or ref if MAP == 0: info[3] = genoref # default to ancestor # print 'genoref', genoref, 'ref', ref, 'geno', geno # Assess nominal significance (P<0.05 and writes on the fly) # --------------------------- numHypotheses += 1 # we are evaluating a hypothesis if Q>=13 and genoref != geno: info[9] = 'Y' info = '\t'.join(info) if verbose: print info # print >>outfh, info # nominal file - deprecated else: info = '\t'.join(info) # print info print >>alloutfh, info # if printing all base calls # DONE LOOP OVER POSITIONS # DONE LOOP OVER CHROMOSOMES # DONE GENOME if outfile: outfh.close() if saveall: alloutfh.close() return numHypotheses # ----------------------------------------------------------------- # ----------------------------------------------------------------- # END LOCAL METHODS # CLI VARIABLES # ------------------------------------------------------------------- outdir = slash(os.getcwd()) btdir = '' btmapdir = ''; SETMAPDIR = False MAPCUT = False findir = '' mapfile = '' bfaref = '' zkey = 'sniper_results' # Default folder name for processed samples ########################### SNIPER PARAMETERS ########################### ######################################################################### # PART I: ALIGNMENT PARAMETERS # ---------------------------- RUNBOWTIE = False # align reads to reference genome using bowtie REALIGN = False # realign reads nprocs = 4 # number of processors per computer for bowtie assembly alignmentPolicy = 'bestm' # basic, best, all, bestk, bestm, maq-bestm, bestm-stratum, best-stratum mismatches = {'SE':1, 'PE':1} # number of mismatches for bowtie alignment (-v mismatches[x]) maxnumalign = 25 # maximum number of alignments a read could have, # subject to -mm mismathces (ONLY FOR policies bestm and bestk) # set this to the expected read coverage readfileformat = 'fastq' # or fasta COUNTREADS = False # count number of mapped reads from bowtie output qualstr = '--phred64-quals' # default is 64-base phred scores for Illumina GA 1.3+ trim5 = '' # trim 5' end of reads by trim5 bases trim3 = '' # trim 3' end of reads by trim3 bases COLORSPACE = False # use bowtie colorspace # mate pair insertion size bounds minins = 100; maxins = 400 # default lower and upper bounds on insert size GETINSERTSIZE = False # estimate PE insert size distributions insstat = '.99' # take as insert bounds the inner insstat percentiles # PART II: READ FILE ORGANIZATION # ------------------------------- ORGANIZEREADS = False # organize bowtie ouput into 4 read files per sampleID: unique/deg/PE/SE REORGANIZEREADS = False # redo this read organization KEEPBOWTIE = False # retain the bowtie output in addition to organized map files... 
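# For orientation only: under the 'bestm' policy the settings above correspond roughly to a
# paired-end bowtie call of the form (hypothetical index and read file names; the real command
# is assembled elsewhere in this script):
#   bowtie -v 1 -a -m 25 --best --phred64-quals -I 100 -X 400 ref_index -1 s_5_1.fastq -2 s_5_2.fastq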
# PART III: SNP CALLING # --------------------- SNPCALLING = False # whether or not to perform genotyping REDOSNPCALLING = False # whether to redo SNP calling on a sampleID which has already been typed REDOFINALSNPCALLING = False # correct for multiple testing to return final significant snps UNIQ = False # genotype using uniqely aligned reads only (--bestm -m 1) BEST = False # genotype using best-guess aligned reads only (--bestk -m 1) # if UNIQ = BEST = False, align using --bestk/m -m d USESINGLETONS = True # genotype using both PE and SE reads boundsfile = '' # file containing chrom\tlow\thigh rows for restricting genotyping singlesampleid = [] # name of single sampleID to genotype LOWSID = None; HIGHSID = None # if user wants to bound which samples are genotyped in a map file LIPieces = 25 # Break loading of unique reads into this number of pieces # greatly reduces memory requirements nthreads = 1 # number of active threads - process different sampleIDs to the same reference # unfortunately, python can't distribute over separate processor cores, # so setting > 1 is rather fruitless maxreadlen = 150 # currently global hardwired string length of all reads Lambda = 0.67 # weighting factor; larger values up-weight global # binomial pr. vs. read-specific phred prob. expPhred = 35 # Expected per-nucleotide sequencing error for global binomial likelihood model ESTIMATEGLOBALERROR = False # estimates the value for expPhred prior = 'resequence' # resequence, maq, uniform, haploid theta = 0.001 # expected human heterozygosity colorspacetheta = (3/2.)*theta # overall haploid divergence for sample--reference genome for bowtie colorspace conversion 0.000000198 # Default range of Q values for significant SNP identification QvalueRange = [0, 5, 13, 20, 40, 80, 100, 160] # SNP POSTPROCESSING SETTINGS # --------------------------- MAXALPHA = None # don't call any SNPs that have alpha > 10 MINALPHA = 0.0 POSTFILTER = False # similar to maxalpha but looks at num deg reads / num reads at locus # instead of num deg alignments vs num reads at locus DDEGCUTOFF = 0.9 # if more than 90% of depth is due to degenerate reads then can't be a SNP REDUNDANCYFILTER = None # Can optionally include a file of redunant loci to mask from SNP calling POSTPROCESSING = False # use a maq-like quality filter # allows 2 criteria to call a SNP FDEPTH = [10,50] # if depth at least x FDEG = [5,1] # if depth/degeneracy ratio at least x FQ = [13,13] # if posterior prob (as phred) at least x # OTHER PARAMETERS # ---------------- REDOALL = False # redo everything DIAGNOSTICS = False # generate summary statistics for genotype results SAVEASSEMBLY = False # save new genome using new SNP calls SKIPHIST = False specialReferenceFiles = None # Internal # READLENBYUSER = False DRYRUN = False # dryrun the pipeline SAVENOMINAL = False # save P < 0.05 SNPs while processing genome ########################################################################## ########################################################################## # CLI ARGS # ------------------------------------------------------------------- help = """ Help for sniper.py Help screen: sniper.py --help Version information: sniper.py --version Requires Unix/Mac OS X/CYGWIN with Python 2.5 or 2.6 (preferably 64-bit for memory allocation > 4GB) Standard execution: sniper.py -m -r -z Two-step execution for paired-end read data: 1. sniper.py -m -r --ins 2. 
sniper.py -m -r -z

This will align raw NGS reads, perform genotyping, and provide diagnostics and new assemblies for all samples according to the map file. Results will be stored in a new directory 'sniper_results' in your current working directory.

Standard arguments:
* -m: map file (required)
* -r: reference genome (required, fasta format)
* -o: primary directory in which to save the results directory (default: current working directory)
* --name: name of the results directory (default: sniper_results)
* -z: perform/redo all procedures (equivalent to '-a -O -s --diagnostics --saveassembly')
* -a: align reads (generate read map)
* -O: organize read map into singly-mapped and multiply-mapped reads
* -s: SNP calling
* --lip: number of separate pieces to load into memory; typically <= number of chromosomes in the reference genome (larger values are slower but use less memory)
* --diagnostics: generate text file summarizing SNP calls
* --saveassembly: create a new fasta file of the reference genome modified by significant SNP calls (NB it is recommended to use this in conjunction with '-Q' to specify the desired phred stringency. Ex: --saveassembly -Q 40 or --saveassembly -Q 20,30,40)

Important parameters:
sniper.py -m -r -z -k 2 -d 10 -e 30 --lambda 0.5 --readlength 38 --prior resequence --theta 0.01

Arguments:
* -k: maximum number of alignment mismatches (default: 1)
* -d: maximum number of alignments per read (default: 25)
* -e: global sequencing error rate (specified as a phred value; e.g. Q=30 == P<0.001) (default: 35)
* --lambda: reweight the likelihood model towards read-specific or global binomial probability (default: 0.67 towards binomial)
* --readlength: length of each sequence read (default: automatic)
* --prior: prior probability distribution (default: resequence)
  Options:
    resequence: h0:1-theta-theta^2, t1:theta/2, h2:theta/2, t2:theta^2
    resequence-het: h0:1-theta-theta^2, t1:theta, h2:theta^2/2, t2:theta^2/2
    maq: h0:(1-theta-theta^2)/2, t1:theta, h2:(1-theta-theta^2)/2, t2:theta^2
    uniform: h0:0.25, t1:0.25, h2:0.25, t2:0.25
    haploid: h0:1-theta, t1:theta
* --theta: expected divergence (heterozygous and homozygous) from reference genome (default: 0.001)

Other notable arguments:
* --peonly: only use paired-end reads for SNP identification
* --uniq: only use uniquely mapping reads for SNP identification
* --colorspace: input files are in ABI SOLiD colorspace format (suffix .csfasta and .qual); NB, currently only works with single-end reads
* --colorspacetheta: Haploid proportion of divergence between sample and reference genomes (default: 3/2 theta)
* --loadinpieces/--lip: Integer number of groups into which unique reads will be broken (default: 10). Larger values will decrease memory requirements.
* --stringency/-Q: Comma-delimited list of phred-quality scores for which to compute significance (Default: 0,5,10,13,20,30,40,50,60,70,80,90,100,110,120,130,140,150,155,160)
* --restrict: Specify a region or file containing a list of regions over which to perform genotyping (default: all)
  EX: --restrict chrom01,0,1000,+ will genotype positions 0 to 1000 of chromosome 1, inclusive
  EX: --restrict chrom01,0,1000,- will genotype all of chromosome 1 excluding positions 0 to 1000
  A boundary file should be a tab-delimited list of one or more regions similarly described, replacing commas with tabs.
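  For example, a boundary file restricting genotyping to positions 0 to 1000 of chrom01 while excluding positions 0 to 500 of chrom02 might contain the two rows:
    chrom01  0  1000  +
    chrom02  0  500   -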
* To estimate the insert size range for paired-end reads, use '--ins' (see below)
* To estimate the expected per-nucleotide sequencing error rate, use '--globalerror'
* To restrict genotyping to a subset of samples specified in the map file, use '--sampleids sid1,sid2,...,sidx'

Map file format:
The map file is a standard tab-delimited text file indicating how each raw sequence reads file should be processed by sniper. The format is organized as one row per sequenced lane and requires 7 columns of information per row, as follows:

#Run  Lane  SampleID        Alias  RefGenome             PairedEnd  Path
1     1     myfirstgenome   foo    /path/to/ref_egenome  0          /path/to/reads/run1/
2     5     mysecondgenome  bar    /path/to/ref_egenome  1          /path/to/reads/run2/

In this example, the first sample was run on lane 1 and contains single-end reads. The second sample was sequenced on lane 5 and contains paired-end reads. The standard mapper used with sniper.py is bowtie, so the specified reference is the name of the respective index file, excluding file suffix. The first line shows the headers but is treated as a comment; comments are discarded by Sniper.

Raw sequence read files should contain as a substring the label indicated in the run column and be placed in the /path/to/reads/ directory given in the Path column. For example, the first sample file may be named 's_1_sequence.txt' or 'testing1.txt'. The second sample should have 2 files since it is paired-end, named 's_5_1_sequence.txt' and 's_5_2_sequence.txt'. Colorspace files should take the format 's_1_sequence.csfasta' and 's_1_sequence.qual' for the fasta and quality files, respectively. Additional fields may be added to the map file after the required columns (e.g. concentration, date sequenced, author, etc.).

Estimating insert size distribution for paired-end data:
By default paired-end reads are mapped using an insert size range of 100 to 400 nt. Users may specify custom min and max bounds using
* --minins: Specify the minimum insert size for all samples
* --maxins: Specify the maximum insert size for all samples

Alternatively use '--ins' to estimate the empirical insert size distribution for each sample specified in the map file:
* --ins: Estimate insert size distributions for all samples specified in the map file.
* --insstat: Use the percentile pct of the insert size distribution for alignment (default: 0.99)

Note that alignments must first be generated using '-a'. A file 'PE_insert_size.txt' will be created in the specified output directory that Sniper will use by default if samples are mapped again (using '--realign/--ra').

Other CLI arguments:
* --bowtie/-btd: Specify the base directory of your bowtie installation (e.g.
--btd /usr/bin/bowtie-0.12.5) (default: '') * --keepbowtie/--kb: Do not delete the original bowtie SAM file following read organization * --policy: Specify Bowtie alignment policy (default: bestk) Options: * 'basic': 'bowtie -n -m 1 (Maq-like) * 'all': 'bowtie -v -a (end-to-k all alignments) * 'bestk': -v -k --best * 'bestm': -v -a -m --best * --peonly: utilize paired-end reads only for SNP calling * --kpe: Permit a maximum number of alignment mismatches for paired-end reads * --kse: Permit a maximum number of alignment mismatches for single-end reads * --qual33: Read qualities are on phred 33 scale instead of default phred 64 scale * --trimreads n5,n3: For alignment, consider a read substring from 1+n5...|r|-n3 * --colorspace/-C: NGS data are in ABI SOLiD colorspace * --realign/--ra: perform a new Bowtie alignment, replacing existing alignments * --reorganize/--rO: generate new organized read maps, replacing existing ones * --redosnp/--rs: redo SNP calling, replacing existing files in the snps/ subdirectory """ version = """sniper.py Version 1.0 Daniel F. Simola Laboratory of Junhyong Kim University of Pennsylvania http://kim.bio.upenn.edu/software/sniper.shtml """ nhelps = 0; helplimit = 0 args = sys.argv[:] argstr = ''.join(args) ai = 1 userformat = False while ai < len(args): arg = args[ai].strip('-').strip('--')#.lower() try: val = args[ai+1] except IndexError: val = '' if re.match('out|^o$', arg): outdir = slash(val) elif re.match('^bowtie$|^btd$', arg): btdir = slash(val) elif re.match('^map$|^m$', arg): mapfile = val elif re.match('^btmapdir$|^bmd$', arg): SETMAPDIR = True; btmapdir = val elif re.match('^mapcut$|^mc$', arg): MAPCUT = True; ai-=1 elif arg == 'fasta': readfileformat = 'fasta'; ai-=1 elif re.match('^name$', arg): zkey = val if zkey[len(zkey)-1] == '/': zkey = zkey[0:len(zkey)-1] elif re.match('^ref$|^b$|^r$', arg): bfaref = val elif re.match('policy|^pol$', arg): alignmentPolicy = val elif re.match('^mcap$|^d$', arg): maxnumalign = int(val) elif re.match('^qual33$', arg): qualstr = '--phred33-quals'; ai-=1 elif re.match('^trimreads$', arg): trim5,trim3 = map(int, val.split(',')) elif re.match('^colorspace$|^C$', arg): COLORSPACE = True; ai-=1 elif re.match('align|^a$', arg): RUNBOWTIE = True; ai-=1 elif re.match('realign|^ra$', arg): REALIGN = True; ai-=1 # elif re.match('countreads$', arg): COUNTREADS = True; ai-=1 elif re.match('usesingletons|pese|sepe|^g$', arg): USESINGLETONS = True; ai-=1 elif re.match('peonly|^pe$', arg): USESINGLETONS = False; ai-=1 elif re.match('^uniq$|^unique$', arg): # only return an alignment if it is unique in the reference genome # given specified maximum mismatches UNIQ = True; alignmentPolicy = 'bestm'; maxnumalign = 1; ai-=1 elif re.match('^best$|^bestguess$', arg): # return an alignment if it is the one with the fewest mismatches # given specified maximum mismatches BEST = True; alignmentPolicy = 'bestk'; maxnumalign = 1; ai-=1 elif arg == 'k': mismatches = {'SE':int(val), 'PE':int(val)} elif arg == 'kpe': mismatches['PE'] = int(val) elif arg == 'kse': mismatches['SE'] = int(val) elif arg == 'ins': GETINSERTSIZE = True; RUNBOWTIE = True; ai-=1 elif arg == 'insstat': insstat = str(val) elif re.match('minins', arg): minins = int(val) elif re.match('maxins', arg): maxins = int(val) # read organization elif re.match('organize|^O$', arg): ORGANIZEREADS = True; ai-=1 elif re.match('reorganize|^rO$', arg): ORGANIZEREADS = True; REORGANIZEREADS = True; KEEPBOWTIE = True; ai-=1 elif re.match('^keepbowtie$|^kb$', arg): KEEPBOWTIE = True; ai-=1 
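# Note on the parsing convention used throughout this loop: ai advances by two
# positions per iteration (one flag plus its value; see 'ai += 2' at the end of
# the loop), so boolean flags that take no value decrement ai by 1 to keep the
# scan aligned with sys.argv.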
elif arg == 'skiphist': SKIPHIST = True; ai-=1 # SNP calling elif re.match('^snp$|^s$', arg): SNPCALLING = True; ai-=1 elif re.match('^redosnp$|^rs$', arg): SNPCALLING = True; REDOSNPCALLING = True; REDOFINALSNPCALLING = True; ai-=1 # this just re-executes the significant SNP filter elif re.match('^redocorrection$|^rc$', arg): SNPCALLING = True; REDOFINALSNPCALLING = True; ai-=1 elif re.match('^redocorrectionpost$|^rcf$', arg): SNPCALLING = True; REDOFINALSNPCALLING = True; POSTPROCESSING = True; ai-=1 elif re.match('^loadinpieces$|^lip$', arg): LIPieces = int(val) # Important parameters elif re.match('readlen|^rl$', arg): maxreadlen = int(val)#; READLENBYUSER = True elif re.match('lambda|^l$', arg): Lambda = float(val) elif re.match('expphred|^e$', arg): expPhred = int(val) elif re.match('globalerror|^ge$', arg): ESTIMATEGLOBALERROR = True; ai-=1 elif re.match('prior', arg): prior = val elif re.match('theta', arg): theta = float(val) elif re.match('colorspacetheta', arg): colorspacetheta = float(val) elif re.match('^stringency$|^Q$', arg): QvalueRange = map(int, val.split(',')) # restricting genotyping for a map file with multiple samples elif re.match('^restrict|^rest$', arg): boundsfile = val elif re.match('^sampleid$|^id$', arg): singlesampleid = [val] elif re.match('^sampleids$|^ids$', arg): singlesampleid = map(lambda x: x.strip(), val.split(',')) elif re.match('^subset|^sub$', arg): LOWSID,HIGHSID = map(int,val.split(',')) # postprocessing settings elif re.match('^maxalpha$|^mxa$', arg): MAXALPHA = float(val) elif re.match('^minalpha$|^mna$', arg): MINALPHA = float(val) elif re.match('^postfilter$', arg): POSTFILTER = True; ai -=1 elif re.match('^pfcutoff$', arg): DDEGCUTOFF = float(val) elif re.match('^repfilter$|^rf$', arg): REDUNDANCYFILTER = val # other settings elif re.match('redoall|^z$', arg): REDOALL = True; ai-=1 elif re.match('^diagnostics$|^diag$', arg): DIAGNOSTICS = True; ai-=1 elif re.match('noass|^na$', arg): SAVEASSEMBLY = False; ai-=1 elif re.match('saveass|^sa$', arg): SAVEASSEMBLY = True; ai-=1 elif re.match('^threads$|^p$', arg): nthreads = int(val) elif re.match('^srefs$|^sr$', arg): specialReferenceFiles = val # for --saveassembly only elif re.match('^nominal$|^noms$', arg): SAVENOMINAL = True; ai -=1 elif re.match('^help|h$', arg.lower()): sys.exit(help) elif re.match('^version|v$', arg.lower()): sys.exit(version) else: astr = "=> The specified argument \""+arg+"\" does not parse. Try --help for more information." 
sys.exit(astr) ai += 2 # ------------------------------------------------------------------- # contingency of flags # -------------------- if REDOALL: RUNBOWTIE = REALIGN = True ORGANIZEREADS = REORGANIZEREADS = True SNPCALLING = REDOSNPCALLING = REDOFINALSNPCALLING = True DIAGNOSTICS = True SAVEASSEMBLY = True if ORGANIZEREADS and not bfaref: sys.exit('Erorr: reference fasta file must be provided with -O using --ref/-r .') # declaration of directories # -------------------------- zdir = outdir+zkey+'/' if not SETMAPDIR: btmapdir = zdir+'read_maps_all/' unmapdir = zdir+'unmapped_reads/' pmadir = zdir+'alignment_mismatches/' if UNIQ: if not SETMAPDIR: btmapdir = zdir+'read_maps_uni/' unmapdir = zdir+'unmapped_reads_uni/' pmadir = zdir+'alignment_mismatches_uni/' if BEST: if not SETMAPDIR: btmapdir = zdir+'read_maps_best/' unmapdir = zdir+'unmapped_reads_best/' pmadir = zdir+'alignment_mismatches_best/' zinsdir = zdir+'PE_insert_size/' tmpstr = 'ALL' if UNIQ: tmpstr = 'UNI' if BEST: tmpstr = 'BEST' resdir = zdir+'results-L%s,E%s,%s,%s/'%(Lambda, expPhred, USESINGLETONS and 'PESE' or 'PE', tmpstr) del tmpstr findir = resdir+'assembled_genomes/' snpdir = resdir+'snps/' phylodir = resdir+'SNPstrings/' tmpdir = resdir+'tmp/' # file suffix depending on map type ubamap = '-all' if UNIQ: ubamap = '-uniq' if BEST: ubamap = '-best' # create new directories # ---------------------- createdir(outdir) createdir(zdir) createdir(btmapdir) createdir(zinsdir) createdir(unmapdir) createdir(pmadir) # save a log file with the user's input arguments logfile = zdir+'log_file.txt' iomethod = 'w' if os.access(logfile, os.F_OK): iomethod = 'a' fh = open(logfile, iomethod) print >> fh, ' '.join(args) fh.close() # parse map file and group replicates # ---------------------------------------- mapd,runs,head = readTable(mapfile,header=0) seqRuns = {} # keyed by sampleID ct = 0 totalsamples = 0 for i in range(len(runs)): ct += 1 if MAPCUT and (ct < LOWSID or ct > HIGHSID): continue totalsamples += 1 run,lane,sampleID,alias,refGenome,paired,path = runs[i], mapd[i][0], mapd[i][1], mapd[i][2], mapd[i][3], int(mapd[i][4]), mapd[i][5] if path == '': path = '/' else: path = slash(path) # make absolute path if local directory specified if path[0] != '/' or path == '/': abspath = slash(os.getcwd()) # if this does not end at the current local path, add path if slash(abspath.split('/')[-2]) != path: abspath += path path = abspath path = slash(path) try: seqRuns[sampleID][(run,lane)] = [refGenome, paired, path] except KeyError: seqRuns[sampleID] = {(run,lane):[refGenome, paired, path]} pipeit('- Read %s samples from map file.'%(totalsamples),1) # --------------------------------------------------------- sortseqruns = seqRuns.keys() sortseqruns.sort() if not MAPCUT: keeps = dict(map(lambda x: (x,None), sortseqruns[LOWSID:HIGHSID])) for sid in sortseqruns: if sid not in keeps: del seqRuns[sid] if len(singlesampleid): sr2 = {} for sid in singlesampleid: if sid in seqRuns: bak = seqRuns[sid] sr2[sid] = bak else: print 'Warning: sample %s not found in map file.'%(sid) seqRuns = sr2 sortseqruns = seqRuns.keys() sortseqruns.sort() if LOWSID or HIGHSID: print '- Processing samples %s'%(sortseqruns[LOWSID:HIGHSID]) else: print '- Processing %s sampleIDs'%(len(sortseqruns)) # --------------------------------------------------------- # load bounds file boundslabel = '' seqbounds = {} if boundsfile and os.access(boundsfile, os.F_OK): d,r,c = readTable(boundsfile, header=False) for i in range(len(d)): tmp = [] try: tmp += [intna(d[i][0])] 
except ValueError: tmp += [nastr] try: tmp += [intna(d[i][1])] except ValueError: tmp += [nastr] tmp += [d[i][2]] # flag - + or - try: seqbounds[r[i]] += [tmp] except KeyError: seqbounds[r[i]] = [tmp] parts = boundsfile.split('/') crap, boundslabel, crap = re.split('(.+)\..+$', parts[len(parts)-1]) boundslabel = '.'+boundslabel elif boundsfile: # this may also be a string of a particular bound: chr01, 500, 1000 splt = boundsfile.split(',') if len(splt) == 1: loc = splt[0] seqbounds[loc] = [(nastr, nastr, '+')] # boundslabel = '.%s,%s,%s,%s'%(loc,nastr,nastr,'+') boundslabel = '.%s'%(loc) elif len(splt) == 3: loc, low, high = map(lambda x: x.strip(' '), splt) seqbounds[loc] = [(intna(low), intna(high), '+')] boundslabel = '.%s,%s,%s,%s'%(loc,low,high,'+') elif len(splt) == 4: loc, low, high,flag = map(lambda x: x.strip(' '), splt) seqbounds[loc] = [(intna(low), intna(high), flag)] boundslabel = '.%s,%s,%s,%s'%(loc,low,high,flag) else: sys.exit('Restriction flag not specified properly: '%(boundsfile)) # DONE INITIALIZATION OF VARIABLES # ========================================================= # Part I: Align reads to reference genome # ========================================================= parts = mapfile.split('/') peinsfile = zinsdir+'PE_insertsize_table-%s'%(parts[len(parts)-1]) peinsize = {} # dict specifying PE insert size statistics, keyed by sampleID if os.access(peinsfile, os.F_OK): d,r,c = readTable(peinsfile, header=True) for i in range(len(d)): peinsize[r[i]] = {} for j in range(len(c)): peinsize[r[i]][c[j]] = (d[i][j] != '' and floatna(d[i][j])) or nastr if RUNBOWTIE or REALIGN: print '\nPart I: GENERATING READ MAPS USING BOWTIE...' # call bowtie on all samples in mapfile for sampleID in sorted(seqRuns.keys()): # set parm str each time here because insert size may be sample specific try: minins1 = peinsize[sampleID]['min'+insstat] maxins1 = peinsize[sampleID]['max'+insstat] if minins1 not in [nastr, '']: minins = minins1 if maxins1 not in [nastr, '']: maxins = maxins1 except KeyError: pass print '- ID %s: if paired-end, min/max insert size = [%s,%s]'%(sampleID, minins, maxins) parmstr = '-t --pairtries 300 %s '%(qualstr) sinparmstr = '-t %s '%(qualstr) if readfileformat == 'fasta': parmstr += ' -f '; sinparmstr += ' -f ' if trim5: astr = '--trim5 %s '%(trim5) parmstr += astr sinparmstr += astr if trim3: astr = '--trim3 %s '%(trim3) parmstr += astr sinparmstr += astr if alignmentPolicy == 'basic': # for the maq-like policy parmstr += '--minins %s --maxins %s -n %s -m 1'%(minins, maxins, mismatches['PE']) sinparmstr += '-n %s -m 1'%(mismatches['SE']) # for singleton samples elif alignmentPolicy == 'all': # end-to-k policy (-v) with all valid alignments reported (-a) parmstr += '--minins %s --maxins %s -v %s -a'%(minins, maxins, mismatches['PE']) sinparmstr += '-v %s -a'%(mismatches['SE']) # for singleton samples elif alignmentPolicy == 'bestk': parmstr += '--minins %s --maxins %s -v %s -k %s --best'%(minins, maxins, mismatches['PE'], maxnumalign) sinparmstr += '-v %s -k %s --best'%(mismatches['SE'], maxnumalign) # for singleton samples elif alignmentPolicy == 'bestm': parmstr += '--minins %s --maxins %s -v %s -a -m %s --best'%(minins, maxins, mismatches['PE'], maxnumalign) sinparmstr += '-v %s -a -m %s --best'%(mismatches['SE'], maxnumalign) # for singleton samples elif alignmentPolicy == 'maq-bestm': parmstr += '--minins %s --maxins %s -n %s -a -m %s --best'%(minins, maxins, mismatches['PE'], maxnumalign) sinparmstr += '-n %s -a -m %s --best'%(mismatches['SE'], maxnumalign) # 
for singleton samples elif alignmentPolicy == 'bestm-stratum': parmstr += '--minins %s --maxins %s -v %s -a -m %s --best --strata'%(minins, maxins, mismatches['PE'], maxnumalign) sinparmstr += '-v %s -a -m %s --best --strata'%(mismatches['SE'], maxnumalign) # for singleton samples elif alignmentPolicy == 'best-stratum': parmstr += '--minins %s --maxins %s -v %s -a --best --strata'%(minins, maxins, mismatches['PE']) sinparmstr += '-v %s -a --best --strata'%(mismatches['SE']) # for singleton samples # output in sam format parmstr += ' --sam'; sinparmstr += ' --sam' for run,lane in seqRuns[sampleID].keys(): refGenome,paired,path = seqRuns[sampleID][(run,lane)] UID = '%s-%s-%s'%(sampleID,run,lane) # find the file that is indicated in the map file fastqFiles = sorted(getFiles(path, include=lane)) # print 'TEST fastqfiles', fastqFiles # sys.exit() if paired: # remove extraneous files that do not have the _1 or _2 in them fastqFiles = filter(lambda x: ('_1' in x or '_2' in x) and True, fastqFiles) if len(fastqFiles) > 2: print >> sys.stderr, 'Warning: Found more than 2 matching read files. Using default s_lane_x_sequence.txt format.' fastqFiles = [path+'s_%s_1_sequence.txt'%(lane), path+'s_%s_2_sequence.txt'%(lane)] elif len(fastqFiles) < 2: print >> sys.stderr, 'Check read file names or path. Did not find 2 files for sample %s in path "%s": %s'%(lane, path, fastqFiles) call = '%sbowtie %s -p %s "%s" -1 "%s" -2 "%s" "%s%s.map" --un "%s%s_unfq.fq"'%(btdir,parmstr,nprocs, refGenome, fastqFiles[0], fastqFiles[1], btmapdir, UID, btmapdir, UID) # call = '%sbowtie %s -p %s "%s" -1 "%ss_%s_1_sequence.txt" -2 "%ss_%s_2_sequence.txt" "%s%s.map" --un "%s%s_unfq.fq"'%(btdir,parmstr,nprocs, refGenome,path,lane,path,lane,btmapdir,UID,btmapdir,UID) print '- Assembling %s paired end data...'%(UID) print call if not DRYRUN and (not os.access(btmapdir+UID+'.map', os.F_OK) or REALIGN): os.system(call) print # set this up to process the remaining singletons as well, and then merge the output files... # outfile is _unfq.fq if 1: # USESINGLETONS print UID, '- Assembling PE left-over singleton data...' for lane in [1,2]: call2 = '%sbowtie %s -p %s "%s" "%s" "%s" --un "%s"'%(btdir, parmstr, nprocs, refGenome, btmapdir+UID+'_unfq_%s.fq'%(lane), btmapdir+UID+'_singles_%s.map'%(lane), unmapdir+UID+'_unmapped_%s.fq'%(lane)) print call2 if not DRYRUN and (not os.access(btmapdir+UID+'_singles_%s.map'%(lane), os.F_OK) or REALIGN): os.system(call2) print # remove the input fq file, contents of which are in singles_map and unmapped os.system('rm -f "%s"'%(btmapdir+UID+'_unfq_%s.fq'%(lane))) else: # rename the --un files from the PE run as unmapped and move them into the unmapped folder for lane in [1,2]: call = 'mv %s %s'%(btmapdir+UID+'_unfq_%s.fq'%(lane), unmapdir+UID+'_unmapped_%s.fq'%(lane)) os.system(call) else: if not COLORSPACE: if len(fastqFiles) > 2: print >> sys.stderr, 'Warning: Found more than 2 matching read files. Using default s_lane_x_sequence.txt format.' fastqFiles = [path+'s_%s_1_sequence.txt'%(lane), path+'s_%s_2_sequence.txt'%(lane)] elif len(fastqFiles) != 1: print >> sys.stderr, 'Check read file names or path. Did not find 1 file for sample %s in dir "%s": %s'%(lane, path, fastqFiles) # len(fastqFiles) != 1: # sys.exit('Check read file names. 
Did not find 1 file for sample %s: %s'%(lane, fastqFiles)) call = '%sbowtie %s -p %s "%s" "%s" "%s%s.map" --un "%s%s_unmapped.fq"'%(btdir, sinparmstr, nprocs, refGenome, fastqFiles[0], btmapdir, UID, unmapdir, UID) # call = '%sbowtie %s -p %s "%s" "%ss_%s_sequence.txt" "%s%s.map" --un "%s%s_unmapped.fq"'%(btdir, sinparmstr, nprocs, refGenome, path, lane, btmapdir, UID, unmapdir, UID) if COLORSPACE: if len(fastqFiles) != 2: print >> sys.stderr, 'Check read file names or path. Did not find 2 files (.csfasta and .qual) for sample %s in file "%s": %s'%(lane,path, fastqFiles) # call = '%sbowtie %s -C -p %s "%s" -f "%ss_%s_sequence.csfasta" -Q "%ss_%s_sequence.qual" "%s%s.map" --un "%s%s_unmapped.fq" --snpfrac %s'%(btdir, sinparmstr, nprocs, refGenome, path, lane, path,lane, btmapdir, UID, unmapdir, UID, colorspacetheta) call = '%sbowtie %s -C -p %s "%s" -f "%s" -Q "%s" "%s%s.map" --un "%s%s_unmapped.fq" --snpfrac %s'%(btdir, sinparmstr, nprocs, refGenome, fastqFiles[0], fastqFiles[1], btmapdir, UID, unmapdir, UID, colorspacetheta) print '- Assembling original singleton data for %s...'%(UID) print call if not DRYRUN and (not os.access(btmapdir+UID+'.map', os.F_OK) or REALIGN): os.system(call) print print '=> DONE', sampleID, '\n' print 'DONE PART I: READ ALIGNMENT FOR ALL SAMPLES' print # ========================================================= # Part IB: Assay raw read coverage / Insert size stats # ========================================================= # if COUNTREADS: # print '- Loading reference genome...' # G = readFasta(bfaref) # # chromosome total lengths (bp) # chromlengths = dict( map(lambda k: (k, len(G[k]['seq'])), G) ) # master_chroms = chromlengths.keys() # # readbphead = ['SampleID', 'Run', 'Lane', 'PE', 'Reads', 'Bases', 'Coverage'] # readbptab = [] # # print 'Computing number of reads for samples...' # for sampleID in sorted(seqRuns.keys()): # for run,lane in seqRuns[sampleID].keys(): # refGenome,paired,path = seqRuns[sampleID][(run,lane)] # # count up reads and bases # if paired: # for pe in [1]: # don't need the second because it has same number of reads # call = "wc -l %s > %s"%(path+'s_'+lane+'_'+str(pe)+'_sequence.txt', outdir+'tmpread.txt') # print 'call', call # os.system(call) # line = readList(outdir+'tmpread.txt') # nreads, crap = line[0].split(' ') # nreads = float(nreads)/2.0 # each read takes up 2 rows in file # row = [sampleID, run, lane, pe,'%.5e'%(float(nreads)), '%.5e'%(float(nreads)*readlen), '%.2f'%(float(nreads)*readlen/float(2*sum(chromlengths.values())))] # readbptab += [row] # os.system('rm "%s"'%(outdir+'tmpread.txt')) # else: # print 'have not implemented single end raw read counting yet...' # printFormattedTable(readbptab, readbphead, file=zdir+'Raw reads summary-%s.txt'%(zkey)) # # sys.exit('Done counting raw reads. Exiting...') # print if GETINSERTSIZE: # save an insert size histogram for each sampleID and a summary table providing .95, .99, and min/max stats # this only works in bowtie saved output as sam maps print 'Computing PE insert size distributions...' 
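# Insert sizes are taken from SAM column 9 (TLEN; index 8 after splitting each
# record on tabs). A TLEN of 0 means the template length is unavailable and the
# record is skipped; the absolute value is used because the two mates of a pair
# report TLEN with opposite signs. Percentiles of the resulting distribution
# (e.g. the 0.99 interval) become the --minins/--maxins bounds when the samples
# are realigned.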
peinsize = [] for sampleID in sorted(seqRuns.keys()): insvalues = [] for run,lane in seqRuns[sampleID].keys(): refGenome,paired,path = seqRuns[sampleID][(run,lane)] UID = '%s-%s-%s'%(sampleID,run,lane) # only used for paired end reads if paired: # get data from the sam/bam file fh = open(btmapdir+UID+'.map') for line in fh.readlines(): if line[0]=='@': continue row = line.strip('\n').split('\t') # print 'row has', len(row) # print row insvalue = abs(int(row[8])) if insvalue != 0: insvalues += [insvalue] fh.close() insvalues = filter(lambda x: x!=nastr and True, insvalues) if len(insvalues) == 0: print '- Unable to read read map for %s'%(sampleID) else: print '- %s has %s values'%(sampleID, len(insvalues)) print ' saving histogram...' hist = histogram(insvalues, nbins=25, yfreq=True) printTable(hist, ['Size', 'Count'], file=zinsdir+sampleID+' histogram.txt') print ' calculating statistics...' ansd = sd(insvalues) meanins = avg(insvalues) sdL = meanins - ansd; sdR = meanins + ansd ninefiveL = percentile(insvalues, .025) ninefiveR = percentile(insvalues, .975) ninenineL = percentile(insvalues, .005) ninenineR = percentile(insvalues, .995) mn = min(insvalues); mx = max(insvalues) perow = [sampleID, meanins, sdL, sdR, ninefiveL, ninefiveR, ninenineL, ninenineR, mn, mx] print perow peinsize += [perow] header = ['ID', 'Mean', 'minSD', 'maxSD', 'min.95', 'max.95', 'min.99', 'max.99', 'min', 'max'] printTable(peinsize, header, file=peinsfile) print '=> Finished estimating insert size distributions.\n\nRerun sniper.py using the -ra flag to realign reads using these estimated insert size bounds.\n' sys.exit() # ========================================================== # Part II: generate final read files grouped by multiplicity # ========================================================== GR = None # we need the reference genome here... flg = None NEEDTOLOADREFERENCE = True # if this flag is triggered, redo multiplicity filter and then perform SNP calling if ORGANIZEREADS: print '\nPART II: READ ORGANIZATION' if NEEDTOLOADREFERENCE: sys.stdout.write(' - Loading reference genome... '); sys.stdout.flush() GR = readFasta(bfaref, TOUPPER=True) flg = sum(map(lambda x: len(x['seq']), GR.values())) sys.stdout.write('N=%s loci\n'%(flg)) NEEDTOLOADREFERENCE = False GRlengths = dict( map(lambda x: (x, len(GR[x]['seq'])), GR.iterkeys()) ) # touch new summary tables pmafile = zdir+'Summary-Perfect vs mismatch alignments%s.txt'%(ubamap) if not os.access(pmafile, os.F_OK): header=['Sample', 'Perfect alignments', 'Mismatched alignments', 'Total alignments', 'Frac perfect', 'Frac mismatch'] record('\t'.join(header), pmafile, 'w') almfile = zdir+'Summary-alignment multiplicity%s.txt'%(ubamap) if not os.access(almfile, os.F_OK): record('Sample: SE and PE multiplicity distributions (alignments, reads)', almfile, 'w') crdfile = zdir+'Summary-chromosome read distribution.txt' if not os.access(crdfile, os.F_OK): record('SampleID\tChrom\tUnique SE\tUnique PE\tNon-unique SE\tNon-unique PE', crdfile, 'w') currentsample = 0 for sampleID in sorted(seqRuns.keys()): sys.stdout.write(' - Looking at sample: %s\n'%(sampleID)); sys.stdout.flush() if (not REORGANIZEREADS) and os.access(btmapdir+sampleID+'.PE.unique.map', os.F_OK): sys.stdout.write(' Already processed.\n'); sys.stdout.flush() continue # record read multiplicity - number of times each read appears # ------------------------------------------------------------ print ' - Assessing read multiplicity...' 
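# Read multiplicity is the number of reported alignments per read id. Read ids
# are prefixed with 'run,lane-' so they are unique across runs and hashed to
# integers to keep the counting dictionaries small; reads with exactly one
# alignment are later written to <sampleID>.PE.unique.map / .SE.unique.map and
# all others to the corresponding .degenerate.map files.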
# addend = {'PE':.5, 'SE':1.0} # used as a lookup below # per sampleID variables of interest npmms = 0; nrows = 0 # count up total rows and total rows with mismatches totreads = {'SE':0, 'PE':0, 'Total':0} totaln = {'SE':0, 'PE':0, 'Total':0} tmpTotal = {} # aggregate into total histogram # for each sampleID save new SAM-format read files # Partition reads into 2 multiplicity groups funi = open(btmapdir+sampleID+'.PE.unique.map', 'w') funiSE = open(btmapdir+sampleID+'.SE.unique.map', 'w') fdeg = open(btmapdir+sampleID+'.PE.degenerate.map', 'w') fdegSE = open(btmapdir+sampleID+'.SE.degenerate.map', 'w') ulinesat = {'PE':{}, 'SE':{}} dlinesat = {'PE':{}, 'SE':{}} isizes = [] # the list of (run,lane) pairs of samples corresponding to this sampleID thesamples = seqRuns[sampleID].keys(); thesamples.sort() # potentially distribute this across CPU? for run,lane in thesamples: # multiplicity counting by read # key is the hitid, value is count, both reads from a mate pair count at 1 RID = {'SE':{}, 'PE':{}} currentsample += 1 UID = '%s-%s-%s'%(sampleID,run,lane) refGenome,paired,path = seqRuns[sampleID][(run,lane)] rlp = '%s,%s-'%(run,lane) # run,lane prefix for all reads fpairs = [] if paired == 1: # filename for standard PE run fpairs += [('PE', btmapdir+UID+'.map')] # filenames for singleton leftovers from PE fpairs += [('SE', btmapdir+UID+'_singles_1.map')] fpairs += [('SE', btmapdir+UID+'_singles_2.map')] else: # filename for singleton run fpairs += [('SE', btmapdir+UID+'.map')] for pese,fname in fpairs: if os.access(fname, os.F_OK): parts = fname.split('/') sys.stdout.write(' - %s Counting alignments for %s...'%(currentsample, parts[len(parts)-1])) sys.stdout.flush() fh = open(fname) count = 0 for row in fh: if row[0]=='@': continue if count % 5000000 == 0: sys.stdout.write('%s,'%(count)); sys.stdout.flush() count += 1 lst = row[:-1].split('\t') try: hitid = lst[0] chrom = lst[2] rem = ' '.join(lst[11:len(lst)]) except IndexError: continue if chrom == '*': continue # record read ids with multiplicity # base = hitid.strip('/1').strip('/2') # hitid = rlp+base # this id is unique across runs # try: RID[pese][hitid] += addend[pese] # don't double count a PE read # except KeyError: RID[pese][hitid] = addend[pese] # record read ids with multiplicity hitid = rlp+hitid # this id is globally unique across runs # replace hitid with unique integer value hashedid = hash(hitid) # each read of a paired end read is considered to be distinct try: RID[pese][hashedid] += 1 except KeyError: RID[pese][hashedid] = 1 # record mismatches tmp = re.split('.*NM:i:(\d+).*', rem) npmms += (len(tmp)>1 and int(tmp[1])>0 and 1) or 0 nrows += 1 fh.close() sys.stdout.write('Done.\n'); sys.stdout.flush() else: print 'Cannot access', fname # end loop reading read info # RID is now populated # ------------------------------- # reorganize RIDSE and RIDPE as dictionaries keyed by multiplicity # to quickly generate a histogram # sys.stdout.write(' - Computing multiplicity histogram...\n'); sys.stdout.flush() # RIDm = {'SE':dict(map(lambda x: (x,0), range(1,maxnumalign+1))), # 'PE':dict(map(lambda x: (x,0), range(1,maxnumalign+1)))} # quickly generate histograms of multiplicity hist = {'SE':[], 'PE':[], 'Total':[]} # RID is keyed by SE or PE returning a dictionary of hitid,count pairs if SKIPHIST: sys.stdout.write('skipping...too many reads\n'); sys.stdout.flush() else: for sp,ridsp in RID.items(): RIDm = dict(map(lambda x: (x,0), xrange(1,maxnumalign+1))) sys.stdout.write(' %s...'%(sp)); sys.stdout.flush() for hitid,val in 
ridsp.iteritems(): try: RIDm[val] += 1 except KeyError: RIDm[val] = 1 hist[sp] = map(lambda val: (val,RIDm[val]), RIDm.keys()) totreads[sp] += sum([foo[1] for foo in hist[sp]]) totaln[sp] += sum([foo[0]*foo[1] for foo in hist[sp]]) sys.stdout.write('DONE.'); sys.stdout.flush() del RIDm totreads['Total'] += totreads['SE']+totreads['PE'] totaln['Total'] += totaln['SE']+totaln['PE'] for sp in ['SE', 'PE']: for val,vct in hist[sp]: try: tmpTotal[val] += vct except KeyError: tmpTotal[val] = vct # for each sampleID save a new SAM-format read file for unique and non-unique reads # --------------------------------------------------------------------------------- pipeit('\n - Saving partitioned read maps...', 1) # for run,lane in subsamples: # UID = '%s-%s-%s'%(sampleID,run,lane) # refGenome,paired,path = seqRuns[sampleID][(run,lane)] # rlp = '%s,%s-'%(run,lane) # run,lane prefix for all reads thefiles = [btmapdir+UID+'.map'] if os.access(btmapdir+UID+'_singles_1.map', os.F_OK): thefiles += [btmapdir+UID+'_singles_1.map'] if os.access(btmapdir+UID+'_singles_2.map', os.F_OK): thefiles += [btmapdir+UID+'_singles_2.map'] for fn in thefiles: label = tail(fn.split('/')) sys.stdout.write(' - Processing %s...'%(label)); sys.stdout.flush() fh = open(fn) rowcnt = 0 for row in fh: if rowcnt % 2500000 == 0: sys.stdout.write('%s,'%(rowcnt)); sys.stdout.flush() rowcnt += 1 linen = row[:-1]#.strip('\n') prelinen = rlp+linen # only header to keep is the @PG line which indicates bowtie call # if row[0:3]=='@SQ': continue if row[0:3]=='@PG': print>>funi, linen; print>>fdeg, linen; continue elif row[0]=='@': continue lst = linen.split('\t') hitid = rlp+lst[0] chrom = lst[2] hasmate = lst[6]; insertsize=int(lst[8]) seq=lst[9]; qual=lst[10] # toss unmapped reads if chrom == '*': continue # don't double count, so only record positive values # only keep a fraction of them to save on memory if hasmate != '*' and insertsize >= 0 and random.random < 0.1: isizes += [insertsize] # save read by its multiplicity hashedid = hash(hitid) if hashedid in RID['SE']: if RID['SE'][hashedid] == 1: print >>funiSE, prelinen # get our own line count for proper sam file format try: ulinesat['SE'][chrom] += 1 except KeyError: ulinesat['SE'][chrom] = 1 else: print >>fdegSE, prelinen # get our own line count for proper sam file format try: dlinesat['SE'][chrom] += 1 except KeyError: dlinesat['SE'][chrom] = 1 elif hashedid in RID['PE']: if RID['PE'][hashedid] == 1: print >>funi, prelinen # get our own line count for proper sam file format try: ulinesat['PE'][chrom] += 1 except KeyError: ulinesat['PE'][chrom] = 1 else: print >>fdeg, prelinen # get our own line count for proper sam file format try: dlinesat['PE'][chrom] += 1 except KeyError: dlinesat['PE'][chrom] = 1 sys.stdout.write('Done.\n'); sys.stdout.flush() fh.close() # END LOOPING OVER FILES # Save results for this sampleID # move on to next group if there are no mapped reads if nrows == 0: continue # record number of perfect/mismatch alignments perfrac = 0 nper = nrows - npmms row = [] perfrac = nper/float(nrows) mmfrac = npmms/float(nrows) row += ['There are %s/%s (%s) perfect (%s/%s) alignments'%(nper,nrows,perfrac,maxreadlen,maxreadlen)] row += ['There are %s/%s (%s) mismatched (<%s/%s) alignments'%(npmms,nrows,mmfrac,maxreadlen,maxreadlen)] printList(row, file=pmadir+'Perfect vs mismatch alignments-%s.txt'%(sampleID)) # save perfect/mismatch reads summary info = '\t'.join(map(str, [sampleID, nper, npmms, nrows, perfrac, mmfrac])) record(info, pmafile, 'a') # save read and 
multiplicity statistics sk = tmpTotal.keys(); sk.sort() hist['Total'] = map(lambda val: (val, tmpTotal[val]), sk) sestr = ', '.join('('+','.join(map(str,p))+')' for p in hist['SE']) pestr = ', '.join('('+','.join(map(str,p))+')' for p in hist['PE']) totstr = ', '.join('('+','.join(map(str,p))+')' for p in hist['Total']) del hist # clean up if sestr == '': sestr = nastr if pestr == '': pestr = nastr recfile = almfile record(sampleID, recfile, 'a') record('SE: reads=%s, alignments=%s, alignments/read=%s'%(totreads['SE'], totaln['SE'], roundna(3)(divna(totaln['SE'],totreads['SE']))), recfile, 'a') record('Histogram: '+sestr, recfile, 'a') record('PE: reads=%s, alignments=%s, alignments/read=%s'%(totreads['PE'], totaln['PE'], roundna(3)(divna(totaln['PE'], totreads['PE']))), recfile, 'a') record('Histogram: '+pestr, recfile, 'a') record('Total: reads=%s, alignments=%s, alignments/read=%s'%(totreads['Total'], totaln['Total'], roundna(3)(divna(totaln['Total'], totreads['Total']))), recfile, 'a') record('Histogram: '+totstr, recfile, 'a') record('', recfile, 'a') # Done saving statistics for this sampleID # ---------------------------------------------------- funi.close(); fdeg.close() funiSE.close(); fdegSE.close() print ' - done partitioning unique and degenerate reads.' # ============================================================== # save the number of SE/PE reads on each chromosome fh = open(zdir+'Summary-chromosome read distribution.txt', 'a') # this prints the number of reads on each chromosome for k in sorted(GR.keys()): use = upe = dse = dpe = 0 try: use = ulinesat['SE'][k] except KeyError: pass try: upe = ulinesat['PE'][k] except KeyError: pass try: dse = dlinesat['SE'][k] except KeyError: pass try: dpe = dlinesat['PE'][k] except KeyError: pass print >>fh, '\t'.join(map(str, [sampleID, k, use, upe, dse, dpe])) fh.close() # ============================================================== # ========================================= # sort read maps (not currently implemented) # for pese in ['PE', 'SE']: # os.system('sort -t "\t" -k 2,3n "%s" > "%s"'%(btmapdir+sampleID+'.'+pese+'.unique.map', btmapdir+sampleID+'.'+pese+'.unique.map')) # os.system('sort -t "\t" -k 2,3n "%s" > "%s"'%(btmapdir+sampleID+'.'+pese+'.degenerate.map', btmapdir+sampleID+'.'+pese+'.degenerate.map')) # alternatively could use samtools for sorting... 
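# Each partitioned map is given a minimal SAM header before use: an @HD line
# plus one @SQ line (sequence name and length taken from the loaded reference)
# per chromosome, since valid SAM requires @SQ records ahead of the alignment
# lines. The header is written to a temporary .uheader/.dheader file, the map
# body is appended to it with cat, and the result is moved back over the
# original map filename.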
# ========================================= sys.stdout.write(' - saving sam file headers...'); sys.stdout.flush() for pese in ['PE', 'SE']: # unique map fh = open(btmapdir+sampleID+'.'+pese+'.uheader.txt', 'w') print >>fh, '\t'.join(['@HD', 'VN:1.0', 'SO:unsorted']) for k,v in sorted(GRlengths.items()): print >>fh, '\t'.join(['@SQ', 'SN:'+k, 'LN:'+str(v)]) fh.close() # concatenate header and body together call = 'cat "%s" >> "%s"'%(btmapdir+sampleID+'.'+pese+'.unique.map', btmapdir+sampleID+'.'+pese+'.uheader.txt') os.system(call) call = 'mv "%s" "%s"'%(btmapdir+sampleID+'.'+pese+'.uheader.txt', btmapdir+sampleID+'.'+pese+'.unique.map') os.system(call) # degenerate map fh = open(btmapdir+sampleID+'.'+pese+'.dheader.txt', 'w') print >>fh, '\t'.join(['@HD', 'VN:1.0', 'SO:unsorted']) # this prints the number of reads on each chromosome # for k,v in sorted(dlinesat[pese].items()): # print >>fh, '\t'.join(['@SQ', 'SN:'+k, 'LN:'+str(v)]) for k,v in sorted(GRlengths.items()): print >>fh, '\t'.join(['@SQ', 'SN:'+k, 'LN:'+str(v)]) fh.close() # concatenate header and body together call = 'cat "%s" >> "%s"'%(btmapdir+sampleID+'.'+pese+'.degenerate.map', btmapdir+sampleID+'.'+pese+'.dheader.txt') os.system(call) call = 'mv "%s" "%s"'%(btmapdir+sampleID+'.'+pese+'.dheader.txt', btmapdir+sampleID+'.'+pese+'.degenerate.map') os.system(call) sys.stdout.write('Done.\n'); sys.stdout.flush() sys.stdout.write(' - saving empirical insert size distribution...'); sys.stdout.flush() hist = histogram(isizes[0:100000], nbins=100) printTable(hist, header=['Size', 'Count'], file=zinsdir+'Insert size distribution.txt') sys.stdout.write('Done.\n'); sys.stdout.flush() # clean up del RID; del ulinesat; del dlinesat; del isizes # finally delete the precursor bowtie files # ----------------------------------------- if not KEEPBOWTIE: for run,lane in seqRuns[sampleID].keys(): UID = '%s-%s-%s'%(sampleID,run,lane) refGenome,paired,path = seqRuns[sampleID][(run,lane)] thefiles = [btmapdir+UID+'.map', btmapdir+UID+'_singles_1.map', btmapdir+UID+'_singles_2.map'] for fn in thefiles: os.system('rm -f "%s"'%(fn)) print 'DONE PART II: READ ORGANIZATION.\n' if ESTIMATEGLOBALERROR: log = lambda x: math.log(x, 10) Q2P = lambda e: (e == 0 and 1-1e-16) or 10**(-e/10.) # avoid 1 P2Q = lambda p: -10*log(p) A2P = dict( [ascii, Q2P(A2Q[ascii])] for ascii in A2Q.keys() ) print 'Estimating global per-nucleotide sequencing error rate...' 
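# Phred conversions used below: Q2P maps a phred score to an error probability,
# P = 10^(-Q/10) (e.g. Q=30 corresponds to P = 0.001; Q=0 is nudged below 1 to
# avoid a probability of exactly 1), and P2Q is its inverse. The global error
# rate reported here is the mean phred score over every base quality in the
# uniquely mapping read files (via the A2Q character-to-phred lookup defined
# elsewhere in this script), also expressed as a probability through Q2P.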
errorsum = 0 ecount = 0 # errors = [] for mapfile in getFiles(btmapdir, include='unique'): print '- Loading', mapfile fh = open(mapfile) count = 1 for line in fh: if count % 1000000 == 0: print ' reached', count count+=1 try: lst = line[:-1].split('\t') chrom = lst[2] if chrom == '*': continue for x in map(lambda x: A2Q[x], lst[10]): errorsum += x ecount += 1 # exception if a line beginning with @ except IndexError: pass except ValueError: pass fh.close() print 'Mean error over %s positions is'%(ecount), errorsum/float(ecount), 'or', Q2P(errorsum/float(ecount)) print sys.exit() # DONE # ==================================================== # Part III: Identify SNPs using bayesian SNP detection # ==================================================== if SNPCALLING: print 'PART III: Genotyping' if not bfaref: sys.exit('Reference genome must be specified using --ref/-r ') createdir(resdir); createdir(snpdir); createdir(tmpdir) sampleIDs = sorted(seqRuns.keys()) newSampleIDs = [] if len(singlesampleid): for sid in singlesampleid: if sid in sampleIDs: newSampleIDs += [sid] else: sys.exit('ID specified by --sampleid ID not found in map.') if len(newSampleIDs): sampleIDs = newSampleIDs; del newSampleIDs # Only load reference if re-estimating... if GR == None: NEEDTOLOADREFERENCE = True for sampleID in sampleIDs: motherfile = snpdir+sampleID+'.mother' # genotype at all informative positions motherfile += boundslabel+'.txt' if not os.access(motherfile, os.F_OK) or REDOSNPCALLING: NEEDTOLOADREFERENCE = True if NEEDTOLOADREFERENCE: sys.stdout.write(' - Loading reference genome... '); sys.stdout.flush() GR = readFasta(bfaref, TOUPPER=True) flg = sum(map(lambda x: len(x['seq']), GR.values())) sys.stdout.write('N=%s loci\n'%(flg)) thesnpdir = snpdir if DRYRUN: thesnpdir = tmpdir def dummyLooper(jobs=[]): for kwargs in jobs: job = threading.Thread(target=sniper, kwargs=kwargs) # job = multiprocessing.Process(target=sniper, kwargs=kwargs) job.run() return 1 # which sampleIDs do we actually need to execute on? 
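# Samples still lacking a .mother output are collected below and split into
# nthreads groups of roughly jpt jobs each. Each group is handed to one
# dummyLooper thread started with start(); within a group, dummyLooper invokes
# sniper via Thread.run(), so the samples of one group are processed
# sequentially while the groups themselves run concurrently. The main thread
# then waits until it is the only active thread.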
sids = [] mofs = {} noms = {} for sid in sampleIDs: motherfile = thesnpdir+sid+'.mother'+boundslabel+'.txt' nominalfile = thesnpdir+sid+'.nominal'+boundslabel+'.txt' if not os.access(motherfile, os.F_OK) or REDOSNPCALLING: print ' - Genotyping sample %s...'%(sid) sids += [sid] mofs[sid] = motherfile noms[sid] = (SAVENOMINAL and nominalfile) or '' else: print ' - Already processed/processing sample %s: SKIP'%(sid) jpt = int(math.ceil(len(sids)/float(nthreads))) threadJobs = [[] for i in range(nthreads)] ct = 0 for sampleID in sids: # set up SNP calling for PE vs PE+SE / unique reads vs unique + deg reads pref = btmapdir+sampleID rf = {'PEuni':pref+'.PE.unique.map', 'PEdeg':'', 'SEuni':'', 'SEdeg':''} if USESINGLETONS: rf['SEuni'] = pref+'.SE.unique.map' if not UNIQ and not BEST: rf['PEdeg'] = pref+'.PE.degenerate.map' rf['SEdeg'] = pref+'.SE.degenerate.map' else: if not UNIQ and not BEST: rf['PEdeg'] = pref+'.PE.degenerate.map' # NO LONGER NECESSARY - THIS IS NOW GENERALIZED # estimate read length from the first line of the map file # if not READLENBYUSER: # rls = [] # for foo in rf.keys(): # if len(rf[foo]): # foofh = open(rf[foo]) # for line in foofh: # try: # lst = line[:-1].split('\t') # chrom = lst[2] # if chrom == '*': continue # rls += [len(lst[9])] # break # # exception if a line beginning with @ # except IndexError: pass # except ValueError: pass # foofh.close() # # if len(unique(rls)) == 1: # readlen = rls[0] # print ' - Detected read length of %s nt.'%(readlen) # else: # sys.exit('Multiple read lengths detected in read map files. User must specify proper length using --readlength flag.') # kwargs = {'readfiles':rf, 'GR':GR, 'saveall':mofs[sampleID], 'outfile':noms[sampleID], 'expPhred':expPhred, 'maxreadlen':maxreadlen, 'Lambda':Lambda, 'prior':prior, 'theta':theta, 'tmpdir':tmpdir, 'bounds':seqbounds, 'LIPieces':LIPieces} if len(threadJobs[ct]) < jpt: threadJobs[ct] += [kwargs] else: ct += 1; threadJobs[ct] += [kwargs] for group in threadJobs: job = threading.Thread(target=dummyLooper, kwargs={'jobs':group}) # job = multiprocessing.Process(target=dummyLooper, kwargs={'jobs':group}) job.start() while threading.activeCount() > 1: time.sleep(1) for sampleID in sampleIDs: print ' - Identifying significant SNPs for %s...'%(sampleID) motherfile = thesnpdir+sampleID+'.mother' # genotype at all informative positions motherfile += boundslabel+'.txt' # nominalfile = None # if SAVENOMINAL: # nominalfile = thesnpdir+sampleID+'.nominal' # nominally significant snp variants # nominalfile += boundslabel+'.txt' # first check if we need to recompute anything checkups = [] for minPvalue in QvalueRange: # multiple-test corrected snp variants snpfile = thesnpdir+sampleID+boundslabel+'.Q%s.txt'%(minPvalue) if os.access(snpfile, os.F_OK) and not REDOFINALSNPCALLING: continue checkups += [(minPvalue,snpfile)] checkups.sort() qccheck = lambda line: len(line[:-1].split('\t'))==11 and True infh = None if len(checkups)>0: # first get total number of hypotheses from mother and store data print ' - Counting number of genomic positions evaluated from mother SNP file...' 
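# The number of hypotheses used for the multiple-test correction is the count
# of well-formed rows (exactly 11 tab-delimited fields; see qccheck) in the
# sample's .mother file, i.e. every genomic position that was actually
# evaluated. It is written as a '# Total positions tested' comment at the top
# of each .Q<phred>.txt output and is the denominator in the per-stringency
# summary printed below.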
try: infh = open(motherfile) except IOError: print 'Cannot access:', motherfile, '...SKIP' if infh: data = filter(qccheck, infh) infh.close() numHypos = len(data) # if mother file contained no information if numHypos == 0: numHypos = 1 print ' - There are %s hypotheses...'%(numHypos) refilt = {} if REDUNDANCYFILTER and os.access(REDUNDANCYFILTER, os.F_OK): # print 'REFILTERING' fh = open(REDUNDANCYFILTER) for line in fh: entries = map(lambda x: x.split(','), line[:-1].split('\t')) # what are all targets offset = 0 tlist = map(lambda x: [x[0],int(x[1])+offset], entries) for c,p,l in entries: refilt[c,int(p)+offset] = tlist fh.close() # now we can post-filter the significant set # based on the actual number of hypotheses we made for Pvalue,snpfile in checkups: pipeit(' - Q%s...'%(Pvalue)) # generate list of significant genotypes snpfile = thesnpdir+sampleID+boundslabel+'.Q%s.txt'%(Pvalue) outfh = open(snpfile, 'w') print >>outfh, '# Total positions tested: %s'%(numHypos) keepers = [] jointmaybes = {} for i in xrange(len(data)): row = data[i][:-1].split('\t') tchrm = row[0]; tpos = int(row[1]) Q = (row[4]==nastr or row[4]=='NA' and nastr) or float(row[4]) ref = row[2]; geno = row[3]; alpha = float(row[8]) genoref = (prior != 'haploid' and iupac2nts(ref)) or ref if genoref != geno and Q != nastr: # check for SNP candidacy ISCANDIDATE = False if (not MAXALPHA and Q >= Pvalue) or (MAXALPHA and alpha <= MAXALPHA and alpha >= MINALPHA and Q >= Pvalue): ISCANDIDATE = True elif POSTPROCESSING: # basically, if we have high depth and at most 10:1 deg to unique, # then ignore the quality scores at this position # moreover if AAFD is close to 50:50 or 100:0 but MAP genotype does not # reflect this, then fix the genotype AAFD = sorted(map(lambda y: (float(y[1]),y[0]), map(lambda x: x.split(':'), row[10].split(',')))) depth = int(row[5]) alpha = float(row[8])+1e-10 # basically this says ignore the P-value if the depth and degeneracy are ok # realistically this is just like using maq except we have degeneracy values if (depth >= FDEPTH[0] and depth/alpha >= FDEG[0] and Q >= FQ[0]) or (depth >= FDEPTH[1] and depth/alpha >= FDEG[1] and Q >= FQ[1]): ISCANDIDATE = True if POSTFILTER: depth = int(row[5]); ddeg = int(row[6]) if float(ddeg)/depth > DDEGCUTOFF: ISCANDIDATE = False # find SNPs that have template redudndancy at this Pvalue if ((not MAXALPHA and Q >= Pvalue) or (MAXALPHA and alpha <= MAXALPHA and alpha >= MINALPHA and Q >= Pvalue)) and (tchrm,tpos) in refilt: ISCANDIDATE = False jointmaybes[(tchrm,tpos)] = [row,i] # FINALLY SAVE SNP IF STILL GOOD if ISCANDIDATE: row[9] = 'Y' print >>outfh, '\t'.join(row) # since this is largest Pvalue, we continue filtering rows... 
# this will accelerate future scans keepers += [data[i]] # update keepers with subset of jointmaybes # this is a list of the positions - cluster these into their redundancy groups # then choose the best 1 from each group candfile = thesnpdir+sampleID+boundslabel+'.candidate.Q%s.txt'%(Pvalue) grouping = {} groupkeeps = 0 # at each position, check for snps at redundant positions for tchrm,tpos in jointmaybes.keys(): # first, we want to isolate positions with the same snp call tmpdct = {} # group redundant snps by genotype for x in refilt[(tchrm,tpos)]: try: tmp = jointmaybes[tuple(x)] entry = (int(tmp[0][4]), float(tmp[0][8]), tmp) try: tmpdct[tmp[0][3]] += [entry] except KeyError: tmpdct[tmp[0][3]] = [entry] except KeyError: pass # treat each genotype separately - we want to avoid replicate calls for cons,tmpobj in tmpdct.iteritems(): tmplist = tmpobj[:] # list of positions with same genotype q = None; row = None; i = None # group calls by quality value qdct = {} for q,alpha,(row,i) in tmplist: try: qdct[q] += [(alpha,row,i)] except KeyError: qdct[q] = [(alpha,row,i)] maxq = sorted(qdct.keys())[-1] # get maximum q group # group snps of max quality by alpha adct = {} for alpha,row,i in qdct[maxq]: try: adct[alpha] += [(row,i)] except KeyError: adct[alpha] = [(row,i)] # keep the snp if there is a single best alpha minalpha = sorted(adct.keys())[0] tmplist = [] for ttlist in adct.values(): tmplist += ttlist # temporarily disabled if len(tmplist) == 1: row,i = tmplist[0] row[9] = 'Y' print >>outfh, '\t'.join(row) keepers += [data[i]] groupkeeps += 1 # else: # # return the remainder # if len(tmplist): print >> outfh2, tmplist nsig = len(keepers) data = keepers outfh.close() pipeit(': %s/%s significant SNP variants at Q > %s (P < %s).'%(nsig, numHypos, Pvalue, 10**(-Pvalue/10.)),1) os.system('rm -rf "%s"'%(tmpdir)) print '\nResults saved to %s'%(thesnpdir) print 'DONE PART III: GENOTYPING.\n' # ==================================================== # Part IV: Diagnostics # ==================================================== if DIAGNOSTICS: print 'PART IV: DIAGNOSTICS...' # use the alloutfile results from SNP calling to gather diagnostics # including the following: # number of reference positions for which we have no read data # histogram of total positions with x read depth # histogram of posterior probabilities for all positions and significant positions stats = { 'Ns':{}, 'numHypos':{}, 'Qns':{}, 'DH':{}, 'Davg':{}, 'alpha':{}} desc = {'sigG':'Distribution of significant genotypes', 'Gtype':'Distribution of significant categorical genotypes', 'Ns': 'Reference positions with no read data', 'numHypos': 'Positions with no information (depth=0 or ref=N)', 'Qns': 'Distribution of all posterior probabilities', 'Qs': 'Distribution of significant posterior probabilities', 'DH': 'Distribution of genome-wide read depth', 'Davg': 'Genome-wide average read depth (avg. depth per position)', 'sigDavg': 'Average read depth (avg. 
depth per position) for significant positions', 'alpha': 'Genome-wide degenerate/unique alignment ratio statistics', 'sigalpha': 'Significant SNP degenerate/unique alignment ratio statistics'} # finish initializing sig entries Pstats = {} for minPvalue in map(str,QvalueRange): Pstats[minPvalue] = {'sigG':{}, 'Qs':{}, 'sigDavg':{}, 'sigalpha':{}, 'Gtype':{}} diagfile = resdir+'Summary-diagnostics' diagfile += ubamap if POSTPROCESSING: diagfile += '-rcf' diagfile += '.txt' fh = open(diagfile, 'w') fh.close() for sampleID in sorted(seqRuns.keys()): motherfile = snpdir+sampleID+'.mother.txt' Ns = {} numHypos = {} # number of positions with information, by chromosome DH = {0:0, 1:0, 2:0, 3:0, 4:0, 5:0, 6:0, 7:0, 8:0, 9:0, 10:0, 11:0, 12:0, 13:0, 14:0, 15:0, 16:0, 17:0, 18:0, 19:0, 20:0, 40:0, 60:0, 80:0, 100:0, 200:0, 300:0, 400:0, 500:0} DHints = sorted(DH.keys()) Dsum = 0; Dcount = 0; allDvals = [] alphasum = 0; allalpha = [] Qns = {0:0, 5:0, 10:0, 15:0, 20:0, 25:0, 30:0, 35:0, 40:0, 44:0, 50:0, 55:0, 60:0, 65:0, 70:0, 75:0, 80:0, 85:0, 90:0, 95:0, 100:0, 105:0, 110:0, 115:0, 120:0, 125:0, 130:0, 135:0, 140:0, 145:0, 150:0, 155:0, 160:0} Qints = sorted(Qns.keys()) print ' - Collecting statistics from %s mother file...'%(sampleID) fh = open(motherfile) for line in fh: row = line[:-1].split('\t') if len(row) != 11: continue C = row[0] P = int(row[1]) R = row[2] G = row[3] Q = row[4]=='NA' and row[4] or int(row[4]) D = int(row[5]) M = int(row[6]) TM = int(row[7]) alpha = float(row[8]) S = row[9] # Y/N Dsum += D Dcount += 1 allDvals += [D] # ratio of unique alignments at this position against total degenerate alignments alphasum += alpha allalpha += [alpha] try: numHypos[C] += 1 except KeyError: numHypos[C] = 1 try: DH[D] += 1 except KeyError: # then it is between intervals idx = 0 while idx < len(DHints) and D>DHints[idx]:idx += 1 if idx == len(DHints): idx -= 1 DH[DHints[idx]] += 1 if G == 'NN': try: Ns[C] += 1 except KeyError: Ns[C] = 1 if Q != 'NA': try: Qns[Q] += 1 except KeyError: try: idx = 0 while Q>Qints[idx]: idx += 1 Qns[Qints[idx]] += 1 except IndexError: Qns[Qints[len(Qints)-1]] += 1 fh.close() allalpha = filterstr(nastr)(map(float,allalpha)) allDvals = filterstr(nastr)(map(float,allDvals)) # save genome-wide statistics for this sampleID stats['Ns'][sampleID] = Ns stats['numHypos'][sampleID] = numHypos stats['Qns'][sampleID] = Qns stats['DH'][sampleID] = DH stats['Davg'][sampleID] = (divna(Dsum,Dcount), sd(allDvals), getQuants(allDvals)) stats['alpha'][sampleID] = (divna(alphasum,Dcount), sd(allalpha), getQuants(allalpha)) # collect stats for significant snps for minPvalue in QvalueRange: snpfile = snpdir+sampleID+boundslabel+'.Q%s.txt'%(minPvalue) sigG = {}; Gtype = {'h0':0, 't1':0, 'h2':0, 't2':0} sigDsum = 0; sigDcount = 0; sigDvals = [] sigalphasum = 0 allsigalpha = [] Qs = {0:0, 5:0, 10:0, 15:0, 20:0, 25:0, 30:0, 35:0, 40:0, 44:0, 50:0, 55:0, 60:0, 65:0, 70:0, 75:0, 80:0, 85:0, 90:0, 95:0, 100:0, 105:0, 110:0, 115:0, 120:0, 125:0, 130:0, 135:0, 140:0, 145:0, 150:0, 155:0, 160:0} print ' - Collecting statistics from significant SNPs file at FWER < %s...'%(minPvalue) # now pass through significant snps fh = open(snpfile) for line in fh: row = line[:-1].split('\t') if len(row) != 11: continue C = row[0] P = int(row[1]) R = row[2] G = row[3] Q = row[4]=='NA' and row[4] or int(row[4]) D = int(row[5]) M = int(row[6]) TM = int(row[7]) alpha = float(row[8]) S = row[9] # Y/N gg = list(G) gj = ''.join(sorted(gg)) try: sigG[gj] += 1 except KeyError: sigG[gj] = 1 # homo/het/triallele if 
gg[0]==gg[1]: if gg[0]==R: Gtype['h0'] += 1 else: Gtype['h2'] += 1 else: if gg[0]!=R and gg[1]!=R: Gtype['t2'] += 1 else: Gtype['t1'] += 1 try: Qs[Q] += 1 except KeyError: try: idx = 0 while Q>Qints[idx]: idx += 1 Qs[Qints[idx]] += 1 except IndexError: Qs[Qints[len(Qints)-1]] += 1 sigalphasum += alpha allsigalpha += [alpha] sigDsum += D sigDcount += 1 sigDvals += [D] fh.close() # save significant statistics for this sampleID at Pvalue PP = str(minPvalue) Pstats[PP]['Qs'][sampleID] = Qs Pstats[PP]['sigG'][sampleID] = sigG Pstats[PP]['Gtype'][sampleID] = Gtype Pstats[PP]['sigDavg'][sampleID] = (divna(sigDsum,sigDcount), sd(sigDvals), getQuants(sigDvals)) Pstats[PP]['sigalpha'][sampleID] = (divna(sigalphasum, sigDcount), sd(allsigalpha), getQuants(allsigalpha)) # print out a summary diagnostics file fh = open(diagfile, 'a') for k in sorted(stats.keys()): # print >>fh, 'Diagnostic:', desc[k] print >>fh, desc[k] for sid in sorted(stats[k].keys()): info = None # if k == 'Davg' or k == 'sigDavg': # info = str(round(stats[k][sid],3)) if k == 'alpha' or k == 'Davg': tags = ('average', 'sd', 'quantiles') info = [str(roundna(3)(stats[k][sid][0])), str(roundna(3)(stats[k][sid][1])), stats[k][sid][2]] info = ', '.join([tags[i]+': '+info[i] for i in range(len(info))]) print >>fh, '\t'.join([sampleID, k, info]) else: info = ', '.join(map(lambda x: ':'.join(map(str, x)), sorted(stats[k][sid].items()))) print >>fh, '\t'.join([sampleID, k, info]) print >> fh print >> fh, 'Statistics for significant SNPs' print >> fh tags = ('average', 'sd', 'quantiles') for P in map(str,QvalueRange): for k in sorted(Pstats[P].keys()): info = None if k == 'sigalpha' or k == 'sigDavg': info = [str(roundna(3)(Pstats[P][k][sid][0])), str(roundna(3)(Pstats[P][k][sid][1])), Pstats[P][k][sid][2]] info = ', '.join([tags[i]+': '+info[i] for i in range(len(info))]) # print >>fh, '\t'.join([sampleID, str(P), k, info]) elif k == 'Qs' or k=='sigG' or k=='Gtype': info = ', '.join(map(lambda x: ':'.join(map(str, x)), sorted(Pstats[P][k][sid].items()))) print >>fh, '\t'.join([sampleID, str(P), k, info]) print >> fh fh.close() print 'DONE PART IV: DIAGNOSTICS.\n' # ================ # Part V: ASSEMBLY # ================ if SAVEASSEMBLY: print 'PART V: Building new genome assembly.' 
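# For each sample and each stringency Q, a fresh copy of the reference fasta is
# loaded and every significant call is substituted into it: the called genotype
# is collapsed to a single IUPAC base with nts2iupac (heterozygous calls thus
# become ambiguity codes) before replacing the reference base at that position,
# and the edited sequences are written under assembled_genomes/. A concatenated
# SNP string across all samples is also saved (multi-fasta and phylip) for
# phylogenetic use.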
createdir(phylodir); createdir(findir) sortedSampleIDs = sorted(seqRuns.keys()) # master copy # G = readFasta(bfaref, TOUPPER=True) if 1: # create a new assembly sequence including novel significant SNPs for sampleID in sortedSampleIDs: for minPvalue in QvalueRange: snpfile = snpdir+sampleID+boundslabel+'.Q%s.txt'%(minPvalue) print ' - Saving new assembly genome...', sampleID, 'at Q >', minPvalue # load reference genome - G will be editied by new SNPs Gtmp = readFasta(bfaref, TOUPPER=True) # load significant SNPs # by right, this should be processed line by line if os.access(snpfile, os.F_OK): dersnps,foo,foo = readTable(snpfile, rownames=False, header=False, comments=False) dct = {} for i in range(len(dersnps)): # row = dersnps[i][0] if dersnps[i][0][0] != '#': chrom = dersnps[i][0] pos = int(dersnps[i][1]) refsnp = dersnps[i][2] cons = dersnps[i][3] allele = nts2iupac(list(cons)).upper() try: dct[chrom] += [(pos,allele)] except KeyError: dct[chrom] = [(pos,allele)] # edit reference genome using these SNPs for chrom in dct.keys(): seq = list(Gtmp[chrom]['seq']) # change significant SNP positions try: for pos,allele in dct[chrom]: seq[pos] = allele except IndexError: print 'Can\'t fetch position: %s,%s | seq length is only %s'%(chrom,pos,len(seq)) Gtmp[chrom]['seq'] = ''.join(seq) printFasta(Gtmp, findir+sampleID+boundslabel+'.Q%s.fasta'%(minPvalue)) else: print ' - Cannot access snp file:', snpfile # save concatenated string of the called derived SNPs for each line # need to know ALL positions of SNPs across lines to get equal length multiple alignment for minPvalue in QvalueRange: cpd = {} # chrom,pos dict: keyed by chrom,pos pairs returning the ref snp at that position cpdhits = {} # record positions for all SNPs across all samples for sampleID in sortedSampleIDs: print ' - Generating SNP string...', sampleID, 'at Q >', minPvalue snpfile = snpdir+sampleID+boundslabel+'.Q%s.txt'%(minPvalue) if os.access(snpfile, os.F_OK): dersnps,foo,foo = readTable(snpfile, rownames=False, header=False, comments=False) for i in xrange(len(dersnps)): if dersnps[i][0][0] != '#': chrom = dersnps[i][0] pos = int(dersnps[i][1]) refsnp = dersnps[i][2].upper() cpd[(chrom,pos)] = refsnp # store this as a position of interest # record number of lines with a SNP here try: cpdhits[(chrom,pos)] += 1 except KeyError: cpdhits[(chrom,pos)] = 1 # now we have all the positions that are hit by some line # save a concatenated snp string skeys = cpd.keys(); skeys.sort() # sorting same for all taxa # skeys = [(chrom,pos) for (chrom,pos) in sorted(cpd.keys())] # sorting same for all taxa snpstrings = [] keepsids = [] # final list of sampleIDs (confirmed SNP file access) # first add the reference string snpstrings += [''.join(map(lambda k: cpd[k], skeys))] # cpd points to reference allele keepsids += ['Reference'] # add alleles from a second external file if specialReferenceFiles: for srf in specialReferenceFiles.split(','): G2 = readFasta(srf, TOUPPER=True) snpstrings += [''.join([G2[chrom]['seq'][pos] for (chrom,pos) in skeys])] srname = srf.split('/')[-1].split('.')[0] keepsids += [srname] del G2 for sampleID in sortedSampleIDs: snpfile = snpdir+sampleID+boundslabel+'.Q%s.txt'%(minPvalue) if os.access(snpfile, os.F_OK): keepsids += [sampleID] dersnps,foo,foo = readTable(snpfile, rownames=False, header=False, comments=False) tmp = {} # record SNP positions for this sample for i in xrange(len(dersnps)): if dersnps[i][0][0] != '#': chrom = dersnps[i][0] pos = int(dersnps[i][1]) dersnp = nts2iupac(list(dersnps[i][3])).upper() 
tmp[(chrom,pos)] = dersnp snpstr = '' for cp in skeys: # recall, this is list of ALL SNP positions across samples refsnp = cpd[cp] # if derived snp in this line, use it, otherwise use the reference if cp in tmp: snpstr += tmp[cp] else: snpstr += refsnp snpstrings += [snpstr] fh = open(phylodir+sampleID+boundslabel+'.SNPstring.Q%s.txt'%(minPvalue), 'w') print >> fh, snpstr fh.close() # now save a multi-fasta and phylip file at this stringency level snpfasta = list2fasta(snpstrings, ids=keepsids) printFasta(snpfasta, file=phylodir+'SNPstring-P%s.fasta'%(minPvalue)) fasta2Phylip(snpfasta, file=phylodir+'SNPstring-P%s.phylip.txt'%(minPvalue), infile='', verbose=False) print 'DONE PART V: BUILDING NEW GENOME ASSEMBLY.\n' print '\nDONE SNIPER.PY'