#!/usr/bin/env python """sniper.py: SNP Identification using Probability of Every Read. Version: 1.0 (unix/mac) Daniel F. Simola (simola@mail.med.upenn.edu) Laboratory of Junhyong Kim University of Pennsylvania 25 January 2011 Copyright (c) 2011, Daniel F. Simola and Junhyong Kim, University of Pennsylvania. All Rights Reserved. You may not use this file except in compliance with the terms of our License. You may obtain a copy of the License at http://kim.bio.upenn.edu/software/ Unless required by applicable law or agreed to in writing, this software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ import os, re, sys, math, time, stat, threading, time, random from operator import add,mul from string import * nastr = '"NA"' # represents missing value # LOCAL METHODS # ----------------------------------------------------------------- # ----------------------------------------------------------------- fixNull = lambda nastr: lambda x: x=='' and nastr or x SGr = {} # global current read pileup dictionary # helper function for pileup updating def delete(x): global SGr del SGr[x] return 1 def pipeit(s, nl=0, pipe=sys.stdout): pipe.write(s+(nl==1 and '\n' or '')); sys.stdout.flush() def isnan(x): return str(x) == str(1e400*0) def unique(lst): """Returns a subset of lst containing the unique elements.""" d = {} for i in lst: if i not in d: d[i] = 0 else: d[i] += 1 e = d.keys(); e.sort() return e def maxlenval(tup): mlv = 0 for i in tup: x = len(str(i)) if x>mlv:mlv = x return mlv def fixspacing(item,nsp): v = str(item) if len(v) == nsp: return v delt = nsp-len(v) if delt < 0: v = v[0:nsp] else: for i in range(delt): v+=' ' return v def transpose(mat): """Given a 2d array of arrays, will invert the dimensions.""" newm = [] for c in range(len(mat[0])): newrow = [] for r in range(len(mat)): newrow.append(mat[r][c]) newm.append(newrow) return newm def flatten(sequence): def rflat(seq2): seq = [] for entry in seq2: if seqin([entry]): seq.extend([i for i in entry]) else: seq.append(entry) return seq def seqin(sequence): for i in sequence: ## all sequences have '__contains__' in their dir() ## parentheses present to aid commenting mid-condition if ('__contains__' in dir(i) and type(i) != str and type(i) != dict): return True return False seq = [sequence][:] # in case parameter isn't already a sequence while seqin(seq): seq = rflat(seq) return seq def createdir(dirname): if not dirname: return 0 if not os.access(dirname, os.F_OK): try: os.mkdir(dirname); return 1 except OSError: pass return 0 def record(astr, filename, ioMethod='a'): if ioMethod == 'a': assert os.access(filename, os.F_OK), 'Cannot access '+filename pmrfh = open(filename, ioMethod) print >>pmrfh, astr pmrfh.close() return 1 def trimComments(dat): # parse comments ndat = [] for line in dat: # only care about non commented lines if not re.match('(^\s*#.*$)|(^\s$)', line): # parse any commentary on this line cleanme = re.split('^(.*)#.*$', line) if len(cleanme) > 1: cleanme.pop(0) ndat.append(cleanme[0].strip('\n')) return ndat def getFiles(indirfile, include=[], exclude=[]): if type(include)!=type(list()): include = [include] if type(exclude)!=type(list()): exclude = [exclude] return getContents(indirfile, typ='files', include=include, exclude=exclude) def getDirectories(indirfile, include=[], exclude=[]): if type(include)!=type(list()): include = [include] if 
type(exclude)!=type(list()): exclude = [exclude] files = getContents(indirfile, typ='dirs', include=include, exclude=exclude) return map(slash, files) def getContents(indirfile, typ='all', include=[], exclude=[]): if type(include)!=type(list()): include = [include] if type(exclude)!=type(list()): exclude = [exclude] keepfiles = [] if len(include) == 0: include = [''] try: mode = os.stat(indirfile)[stat.ST_MODE] INISADIR = False if stat.S_ISDIR(mode): INISADIR = True indirfile = slash(indirfile) files = [] if INISADIR and os.access(indirfile, os.F_OK): for f in os.listdir(indirfile): # print indirfile, 'vs', f # if f[0] != '.' and f[0] != '~' and f[len(f)-3:len(f)] not in exclude: if f[0] != '.' and f[0] != '~': # do includes for item in include: if re.match('.*%s.*'%(item), f): # confirm not a directory mode = os.stat(indirfile+f)[stat.ST_MODE] if typ=='all': if stat.S_ISDIR(mode): files.append(indirfile+f) else: files.append(indirfile+f) elif typ=='files' and not stat.S_ISDIR(mode): files.append(indirfile+f) elif typ=='dirs' and stat.S_ISDIR(mode): files.append(indirfile+f) # else: print indirfile+f, 'is a directory' for f in files: inexclude = False for item in exclude: if re.match('.*%s.*'%(item), f): inexclude = True if inexclude == False: # confirm not a directory keepfiles.append(f) # try: # print 'testing', indirfile+f # mode = os.stat(indirfile+f)[stat.ST_MODE] # print 'mode', mode # except OSError: # print 'why?' # if typ=='all': # if stat.S_ISDIR(mode): keepfiles.append(indirfile+f+'/') # else: keepfiles.append(indirfile+f) # elif typ=='files' and not stat.S_ISDIR(mode): # keepfiles.append(indirfile+f) # elif typ=='dirs' and stat.S_ISDIR(mode): # print 'got a dir' # keepfiles.append(indirfile+f+'/') # else: # print 'failure', stat.S_ISDIR(mode) elif not INISADIR: keepfiles.append(indirfile) else: sys.exit('cannot access dir '+indirfile) except OSError: pass return keepfiles def printList(lst, title="", delim="\n", pipe=sys.stdout, file='', sort=False, newline='\n', ioMethod='w'): if file: pipe = open(file,ioMethod) if sort: lst.sort() if title: lst = [title]+lst#print >> pipe, title for i in range(len(lst)): if i == len(lst)-1: print >> pipe, str(lst[i]), else: print >> pipe, str(lst[i])+delim, if newline: print >> pipe, newline, if file: pipe.close() def readTable(filename, header=True, rownames=True, delim="\t", comments=True, keepKey=False): fh = open(filename) table = fh.readlines() fh.close() if comments: table = trimComments(table) tabHeader = []; rowIDs = []; data = [] if header: tabHeader = table.pop(0).strip('\n').split(delim) if not keepKey: tabHeader.pop(0) # else: tabHeader = table.pop(0).strip('\n').split(delim) for line in table: sline = line.strip('\n').split(delim) if rownames: rowIDs.append(sline.pop(0)) data += [sline] return data, rowIDs, tabHeader def printTable(tab, header=[], rows=[], delim="\t", newline='\n', file='', pipe=sys.stdout, ioMethod='w'): if file: pipe = open(file,ioMethod) if header: printList(header, delim=delim, pipe=pipe, newline=newline) if len(rows): assert len(rows) == len(tab), "Rows and tab must have same length." 
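# Illustrative note: when row labels are supplied, each label is prepended to its data row before
# printing. For example (hypothetical values), printTable([[1,2]], header=['a','b'], rows=['r1'])
# would emit a header line 'a<TAB>b' followed by the data line 'r1<TAB>1<TAB>2'.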
for ri in range(len(tab)): r = [rows[ri]]+tab[ri] printList(r, delim=delim, pipe=pipe, newline=newline) else: for r in tab: printList(r, delim=delim, pipe=pipe, newline=newline) if file: pipe.close() def printFormattedTable(tab, header=[], rows=[], delim=" ", newline='\n', file='', pipe=sys.stdout, nastr='"NA"', ioMethod='w', colSpacing=[]): if file: pipe = open(file,ioMethod) if tab == [] and file: pipe.close(); return tab2 = tab[:] if rows: for i in range(len(tab)): tab2[i] = [rows[i]]+tab[i] # need to determine for each column the max length value and space to that collens = colSpacing[:] if not len(collens): # print 'printformattedtable: fixing col spacing' temp = transpose(tab2) if len(header): for i in range(len(temp)): r = temp[i] r.append(header[i]) collens.append(maxlenval(r)) else: for r in temp: collens.append(maxlenval(r)) # now create a new tuple with proper spacing temp = []; sheadings = [] if len(header): for i in range(len(header)): sheadings.append(fixspacing(header[i],collens[i])) for row in tab2: newrow = [] for i in range(len(row)): val = row[i]==nastr and '.' or row[i] #if val==nastr: val = '.' # SAS missing value newrow.append(fixspacing(val,collens[i])) temp.append(newrow) # pipe = open(file,ioMethod) printTable(temp,header=sheadings,pipe=pipe,delim=delim,newline=newline) if file: pipe.close() def nts2iupac(nts): X = {'A':'A', 'T':'T', 'C':'C', 'G':'G', 'N':'N', 'U':'U', '-':'-'} X['GT'] = 'K' X['AC'] = 'M' X['CGT'] = 'B' X['ACG'] = 'V' X['CG'] = 'S' X['AT'] = 'W' X['AGT'] = 'D' X['CT'] = 'Y' X['AG'] = 'R' X['ACT'] = 'H' X['AA'] = 'A' X['TT'] = 'T' X['CC'] = 'C' X['GG'] = 'G' nts = ''.join(sorted(map(lambda x: x.upper(), nts))) return X[nts] def iupac2nts(a,double=True): X = {'A':'A', 'T':'T', 'C':'C', 'G':'G', 'N':'N', 'U':'U', '-':'-'} X['K'] = 'GT' X['M'] = 'AC' X['B'] = 'CGT' X['V'] = 'ACG' X['S'] = 'CG' X['W'] = 'AT' X['D'] = 'AGT' X['Y'] = 'CT' X['R'] = 'AG' X['H'] = 'ACT' if double: X['A'] = 'AA' X['T'] = 'TT' X['C'] = 'CC' X['G'] = 'GG' return X[a.upper()] def readFasta(faname, usealtid=False, split='^>', idPat='', addins=[], TOUPPER=False, VERBOSE=False): """Parses a multiple fasta file into a dictionary object keyed by fasta identifier. Option to key the dictionary by an alternative id found in the fasta header: altid=True|False. If directory of several fasta files is provided as faname, loads them into single dictionary. 
""" fadict = {} # return dictionary of fasta entries faj = '' # load all information into single data string, faj files = getFiles(faname) for f in files: if VERBOSE: print '- Loading %s'%(f) fh = open(f) fa = fh.readlines() fh.close() # remove comment lines faj += ''.join(filter(lambda x: not re.match(' *#.*', x) and True, fa)) # parse by entry (>) getentries = re.compile(split, re.MULTILINE) faentries = re.split(getentries, faj) # for some reason the first entry is the null string faentries.pop(0) # parse individual entries for entry in faentries: # first has format >name other-header-info (header, seq) = entry.split("\n", 1) # trim off the '>' character theid = "" altid = "" info = "" # further split up the info - for use of alternative identifier pattern = '^(\S+) (.*)$' if re.match(pattern, header): (crap, theid, info, crap) = re.split(pattern, header) info = info.strip(' ').strip('\t') if len(info) > 0: #print "\""+info+"\"" (crap, altid, info, crap) = re.split('(\S+)(.*)', info) info = info.strip(' ').strip('\t') else: theid = header altid = "" # remove newlines so the sequence data is contiguous seq = re.sub("\n", "", seq) # remove terminal whitespace seq = seq.strip(' ') if idPat != '': crap, theid, crap = re.split(idPat, theid) #print 'theid', theid for pre,suf,pat in addins: if re.match(pat, theid): # print 'theid', theid, 'pat', pat theid = pre+theid+suf # print 'new', theid if TOUPPER: seq = seq.upper() # build the entry # add a little check to see if there are repeat entries if fadict.has_key(theid): print theid, ": This key has already been used to store another fasta entry!" else: # build the entry for this id # allow choice of primary dict key if theid and altid and usealtid: #print "OK", theid, altid fadict[altid] = {'id':theid, 'altid':altid, 'info': info, 'seq': seq} else: fadict[theid] = {'id':theid, 'altid':altid, 'info': info, 'seq': seq} return fadict def list2fasta(seqs=[], ids=[]): dct = dict() uid = 0 for i in range(len(seqs)): theid = str(uid) try: theid = ids[i] except IndexError: pass dct[theid] = {'id':theid, 'seq':seqs[i], 'info':''} uid += 1 return dct def printFasta(fadict, file='', pipe=sys.stdout, key='seq', usealtid=False): """Print a dictionary of fasta entries to a file in mfasta format""" keys = fadict.keys() keys.sort() if file: pipe = open(file,'w') for theid in keys: if 'altid' in fadict[theid] and not usealtid: header = '>'+theid+' '+fadict[theid]['altid']+' '+fadict[theid]['info'] elif 'id' in fadict[theid]: header = '>'+theid+' '+fadict[theid]['id']+' '+fadict[theid]['info'] else: header = '>'+theid+' '+fadict[theid]['info'] print >> pipe, header print >> pipe, fadict[theid][key] print >> pipe if file: pipe.close() def fasta2Phylip(dct={}, file='phylipi.txt', infile='', verbose=False): """Convert a fasta file into Phylip interleaved format.""" # print infile d = dct if infile: d = readFasta(infile) names = d.keys() # fix length maxlen = 10+1 shorts = ['' for i in range(len(names))] for i in range(len(names)): shorts[i] = fixspacing(names[i], maxlen) blank = fixspacing('',maxlen) taxa = len(names) # print d # print 'test', d[d.keys()[0]] alilen = len(d[d.keys()[0]]['seq']) seqs = ['' for i in range(taxa)] if verbose: print shorts if verbose: print taxa, alilen if verbose: print map(len, seqs) enc = 'ascii' for i in range(taxa): seqs[i] = d[names[i]]['seq'].encode(enc) # print out an interleaved phylip formatted file pos = 0 fh = open(file, 'w') # alilen = 715 fh.write(str(taxa)+' '+str(alilen)+'\n'.encode(enc)) for i in range(taxa): 
fh.write(shorts[i]+seqs[i][pos:pos+maxlen]+' '+seqs[i][pos+maxlen:pos+2*maxlen]+' '+seqs[i][pos+2*maxlen:pos+3*maxlen]+' '+seqs[i][pos+3*maxlen:pos+4*maxlen]+'\n'.encode(enc)) pos = 2*maxlen fh.write('\n'.encode(enc)) while pos < alilen: for i in range(taxa): fh.write(blank+seqs[i][pos:pos+maxlen]+' '+seqs[i][pos+maxlen:pos+2*maxlen]+' '+seqs[i][pos+2*maxlen:pos+3*maxlen]+' '+seqs[i][pos+3*maxlen:pos+4*maxlen]+'\n'.encode(enc)) pos += 4*maxlen fh.write('\n'.encode(enc)) fh.close() return 1 def argmax(v): """Return argmax of a list""" mi = 0 mv = v[0] for i in range(1,len(v)): if float(v[i]) > float(mv): mv = float(v[i]) mi = i return mi def slash(astr): if not len(astr): return '/' elif astr[-1] !='/': astr+='/' elif astr[len(astr)-2:len(astr)] == '//': astr = astr[:-1] return astr def makeContigs(intervals): intervals.sort() # print intervals[0:100] # combine overlaps coverage = 0 # total bases mapped contigs = [] curr = 1 while curr < len(intervals): bak = curr start = intervals[curr-1][0]; stop = intervals[curr-1][1] while curr < len(intervals) and intervals[curr][0] <= stop: if intervals[curr][1] > stop: stop = intervals[curr][1] curr += 1 contigs += [(start,stop)] coverage += abs(start-stop) curr += 1 # print 'contigs', contigs return {'contigs':contigs, 'coverage':coverage} eq = lambda s: lambda i: str(s)!=str(i) filterstr = lambda s: lambda v: filter(eq(s), v) roundna = lambda e: lambda x: ((x==nastr or e==nastr) and nastr) or round(float(x),int(e)) floatna = lambda x: x == nastr and nastr or float(x) intna = lambda x: (x == nastr or x == '' or x == 'NA' and nastr) or int(x) def factit(x): """Iterative implementation of factorial function.""" if x <= 1: return 1 a = [0 for i in range(x+1)] a[0] = 1; a[1] = 1 for i in range(2,x+1): a[i] = i*a[i-1] return a[x] nchoosek = lambda n,k: factit(n)/(factit(k)*factit(n-k)) # binomial coefficient prod = lambda lst: reduce(mul, lst) # product function # basic statistics def avg(lst): try: return reduce(add, lst)/float(len(lst)) except TypeError: return nastr def harmonicMean(v, minval=1e-20): if len(v) == 0: return 0 # prevent 0 division v = map(lambda x: x==0 and minval or x, v) return len(v) / sum(map(lambda x: 1./x, v)) ssd = lambda lst, mu: reduce(add, map(lambda x: (x-mu)*(x-mu), lst)) def variance(lst): try: return divna(ssd(lst,avg(lst)), float((len(lst)-1))) except TypeError: return nastr def sd(lst): try: return math.sqrt(variance(lst)) except TypeError: return nastr def percentile(v=[], k=0.5): """ Return the value of array that corresponds to the kth percentile of the data. 
""" if len(v) <= 0: return 0.0 temp = map(float, v) temp.sort() n = len(temp) idx = float(k)*(n+1) lidx = math.floor(idx) hidx = math.ceil(idx) if idx < 1.0: return float(temp[0]) elif idx > n-1: return float(temp[n-1]) elif not (idx == lidx or idx == hidx): return float(temp[int(lidx)] + temp[int(hidx)])/2.0 else: return float(temp[int(idx)]) median = lambda v: percentile(v,.5) getQuants = lambda dat: ','.join([str(q)+':'+str(round(percentile(dat, q),4)) for q in [0.05, 0.25, 0.5, 0.75, 0.95]]) # list operations head = lambda x: x[0] tail = lambda x: x[len(x)-1] car = lambda x: x[0] cdr = lambda x: x[len(x)-1] # arithmetic operations with missing values subna = lambda m, v: ((m==nastr or v==nastr or float(v)==0.0) and nastr) or float(m) - float(v) multna = lambda m, v: ((m==nastr or v==nastr or float(v)==0.0) and nastr) or float(m) * float(v) divna = lambda m, v: ((m==nastr or v==nastr or float(v)==0.0) and nastr) or float(m) / float(v) roundna = lambda e: lambda x: ((x==nastr or e==nastr) and nastr) or round(float(x),int(e)) def makeRange(xmin,xmax,interval=None,n=None): if n == None: n = int(abs(xmax-xmin)/float(interval)) else: interval = (xmax-xmin)/float(n) interval += interval/float(n-1) return [xmin+interval*i for i in range(n)] def histogram(dat, nbins=100, bins=[], categorical=False, yfreq=False, logscale=False, logbase=10, shift=0, dtype=None): import math if logscale: dat = map(lambda x: math.log(x,logbase), dat) if len(bins) > 0: # print 'problem?' nbins = len(bins) givenbins = True else: givenbins = False # need to create a histogram filt = filterstr(nastr)(dat) if dtype: filt = map(dtype, filterstr(nastr)(dat)) dat = None if not len(filt): return [[0,0]] counts = [0 for i in range(nbins)] if not givenbins and not categorical: bins = makeRange(min(filt), max(filt), n=nbins) elif not givenbins: bins = unique(filt) # minor bin correction for plotting if shift != 0: bins = map(lambda x: x+shift, bins) # this is like O(N2) too slow if not categorical: for j in range(len(filt)): k = 0 # print j, len(filt) while k < nbins and float(filt[j]) > bins[k]: k +=1 if k == nbins: k -= 1 counts[k] += 1 else: bindct = {} for b in bins: bindct[b] = 0 bk = bindct.keys() for j in range(len(filt)): bindct[filt[j]] += 1 # convert back to a histogram bins = []; counts = [] for key in bindct.keys(): bins += [key]; counts += [bindct[key]] tmp = [] intnbins = int(nbins) # counts or frequency? 
if yfreq: tot = float(sum(counts)) if logscale: tmp = [[logbase**bins[k],counts[k]/tot] for k in range(intnbins)] else: tmp = [[bins[k],counts[k]/tot] for k in range(intnbins)] if logscale: tmp = [[logbase**bins[k],counts[k]] for k in range(intnbins)] else: tmp = zip(bins,counts) if categorical: tmp = sorted(tmp) return tmp # dictionary keyed by ascii character returning phred q-score # ascii 33 to 126 - full range fastqchars = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~'] A2Q = dict(zip(fastqchars, range(len(fastqchars)))) # string conversion to integer (potentially a long) encodeNt = lambda x: x=='A'and'1' or x=='T'and'2' or x=='C'and'3' or x=='G'and'4' or x=='N'and'5' encodeSeq = lambda seq: int(''.join(map(encodeNt, seq))) decodeNt = lambda x: x=='1'and'A' or x=='2'and'T' or x=='3'and'C' or x=='4'and'G' or x=='5'and'N' decodeSeq = lambda val: ''.join(map(decodeNt, str(val))) encodeQ = dict(zip(fastqchars, map(str,range(10,len(fastqchars)+10)))) # every character is 2 integer digits decodeQ = dict(zip(map(str,range(10,len(fastqchars)+10)), fastqchars)) encodeQual = lambda qual: int(''.join(map(lambda x: encodeQ[x], qual))) pairUP = lambda qual: map(lambda x: qual[x-1]+qual[x] ,range(1,len(qual),2)) decodeQual = lambda qual: ''.join(map( lambda x: decodeQ[x], pairUP(str(qual)) )) # the core procedure def sniper(readfiles, GR, outfile='', saveall='', tmpdir='', expPhred=30, maxreadlen=150, Lambda=.5, prior='resequence', theta=0.001, verbose=False, mindepth=1, bounds={}, ploidy=2, LIPieces=25): """Bayesian genome SNP calling utilizing multiply aligned reads bounded by k mismatches. """ # count total genomic positions flg = sum(map(lambda x: len(x['seq']), GR.values())) # store length of each chromosome/scaffold GRlengths = dict( map(lambda x: (x, len(GR[x]['seq'])), GR.iterkeys()) ) # Internal methods # --------------------------------------------------------------- ln = lambda x: math.log(x, math.e) log = lambda x: math.log(x, 10) lowestQ = int(-10*log(1e-16)) # convert between phred score and p-value Q2P = lambda e: (e == 0 and 1-1e-16) or 10**(-e/10.)
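# Worked example of the conversions used here (illustrative values only):
#   Q2P(30) -> 10**-3 = 0.001 (probability that a base call is wrong)
#   Q2P(0)  -> 1-1e-16 (capped just under 1)
# and of the integer compression defined above for storing read pileups:
#   encodeSeq('ATCGN') -> 12345, decodeSeq(12345) -> 'ATCGN'
#   encodeQual('!#')   -> 1012,  decodeQual(1012) -> '!#'  (each quality character is stored as two digits)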
# avoid 1 P2Q = lambda p: -10*log(p) A2P = dict( [ascii, 1-Q2P(A2Q[ascii])] for ascii in A2Q.keys() ) # Probability of no error invA2P = dict( [ascii, Q2P(A2Q[ascii])] for ascii in A2Q.keys() ) # Probability of error # Computes mismatches between 2 sequences derror = lambda a,b: (a == b and 0) or sum(map(lambda x,y: x != y and 1 or 0, a,b)) def intervalWithinRange(baseint, queryint): """Returns 0,1,2,3 according to these rules: 0: queryint is not within base int 1: queryint is within base int 2: queryint spans the beginning of baseint 3: queryint spans the end of baseint baseint/queryint need not already be sorted ie, [1000, 2000] and [2000,1000] will return same value """ # assumes sorted coordinates - slightly faster # baseint = sorted(baseint); queryint = sorted(queryint) # if the end of the element comes before the start of the base if (queryint[1] < baseint[0] or baseint[1] < queryint[0]): return 0 # if element lies inside the base region elif (queryint[0] >= baseint[0] and queryint[1] <= baseint[1]): return 1 # if we span a region elif (queryint[0] < baseint[0] and queryint[1] >= baseint[0]): return 2 else: return 3 def overlaps(tt,q): return sum([intervalWithinRange(t,q) for t in tt]) def loadUniqueReads(tmpfile, fromChromosomes, filterids={}, bounds={}): # initialize D with chrom names D = dict( map(lambda x: (x,{}), fromChromosomes) ) Bb = {} # set up dictionary based on bounds for k in bounds.keys(): for low,high,flag in sorted(bounds[k]): if flag == '+': try: Bb[k] += [(low,high)] except KeyError: Bb[k] = [(low,high)] if not os.access(tmpfile, os.F_OK): return D fh = open(tmpfile) count = 1 for line in fh: if count % 1000000 == 0: sys.stdout.write('%s,'%(count)); sys.stdout.flush() count+=1 try: lst = line[:-1].split('\t') chrom = lst[2] pos = int(lst[3])-1 if chrom == '*' or (chrom not in D) or lst[0] in filterids: continue elif chrom in Bb and overlaps(Bb[chrom], (pos,pos+len(lst[10]))) == 0: continue # collapse reads by position, so that each list index contains all reads which start at this position # = (pos, seq, qual, hitid) # stuff = (pos, lst[9], lst[10], lst[0]) # stuff = (pos, lst[9], lst[10], hash_float(lst[0])) # stuff = (pos, lst[9], lst[10], hash(lst[0]), len(lst[9])) # NEW! added len(seq) # compress seq and qual here too stuff = (pos, encodeSeq(lst[9]), encodeQual(lst[10]), hash(lst[0]), len(lst[9])) # NEW! 
added len(seq) if pos in D[chrom]: D[chrom][pos] += [stuff] else: D[chrom][pos] = [stuff] # exception if a line beginning with @ except IndexError: pass except ValueError: pass fh.close() return D # map(lambda pos: pos in D and D[pos] or [], range(totallen)) def loadDegReads(readfiles, chromosomesOfInterest, bounds={}): """Load the degenerate portion of the read map for any reads aligning to this chromosome.""" # convert to dict chint = dict( map(lambda x: (x,None), chromosomesOfInterest) ) Bb = {} # set up dictionary based on bounds for k in bounds.keys(): for low,high,flag in sorted(bounds[k]): if flag == '+': try: Bb[k] += [(low,high)] except KeyError: Bb[k] = [(low,high)] # gather reads of interest wrt this chrom group readsOfInterest = {} pipeit(' - Scanning for degenerate reads of interest...') for pese in ['PE', 'SE']: degfile = readfiles[pese+'deg'] if degfile: fh = open(degfile) count = 1 for line in fh: if count % 1000000 == 0: pipeit('%s,'%(count)) count+=1 if line[0] == '@': continue lst = line[:-1].split('\t') chrom = lst[2] pos = int(lst[3])-1 # only keep a read if it has an alignment on these chromosomes/bounds if chrom in chint: readsOfInterest[lst[0]] = None elif chrom in Bb and overlaps(Bb[chrom], (pos,pos+len(lst[10]))) > 0: readsOfInterest[lst[0]] = None fh.close() pipeit('Done',1) # load all degenerate reads # ------------------------- RdxChr = {'PE':{}, 'SE':{}} # chrKeep = {} for chrom in GR.keys(): # needs all chrom keys # chrKeep[chrom] = None RdxChr['PE'][chrom] = {} RdxChr['SE'][chrom] = {} otherhits = {} OHI = {} # we actually need 2 passes through this file # first, fetch all reads aligned to the chromosomes of interest # then collect all reads with that name for pese in ['PE', 'SE']: degfile = readfiles[pese+'deg'] if degfile: pipeit(' - Loading degenerate %s reads...'%(pese)) fh = open(degfile) count = 1 for line in fh: if count % 1000000 == 0: pipeit('%s,'%(count)) count+=1 try: lst = line[:-1].split('\t') chrom = lst[2] if chrom == '*' or lst[0] not in readsOfInterest: continue hitid = hash(lst[0]); pos=int(lst[3])-1; seq=lst[9]; qual=lst[10] # only add this read if we haven't already recorded it # our read maps duplicate both ends of a PE read even if # only one of the mate pairs has multiple alignments # so we need to eliminate these duplicate entries # making OHI a necessity # the complete information we need for SNP calling stuff = (pos, encodeSeq(seq), encodeQual(qual), hitid, len(seq)) # if initial map file has large d, user can reduce memory usage as desired by capping # the number of alignments loaded into memory here loh = 0 try: loh = len(otherhits[hitid]) except KeyError: pass # make sure this isn't a duplicate entry in the map file # by checking location of this alignment (see comment above) if hitid in OHI and (chrom,pos) not in OHI[hitid] and loh < maxnumalign: otherhits[hitid] += [(stuff,chrom)] OHI[hitid][(chrom,pos)] = None # and add the info to the non-unique read pileup if pos in RdxChr[pese][chrom]: RdxChr[pese][chrom][pos] += [stuff] else: RdxChr[pese][chrom][pos] = [stuff] elif hitid not in OHI and loh < maxnumalign: otherhits[hitid] = [(stuff,chrom)] OHI[hitid] = {(chrom,pos):None} # and add the info to the non-unique read pileup if pos in RdxChr[pese][chrom]: RdxChr[pese][chrom][pos] += [stuff] else: RdxChr[pese][chrom][pos] = [stuff] # exceptions if line begins with @ except IndexError: continue except ValueError: continue fh.close() pipeit('Done.',1) del OHI # no longer needed return RdxChr, otherhits # 
========================================================================= # Initialize local variables # ========================================================================= if prior == 'haploid': ploidy = 1 if ploidy == 1: prior = 'haploid' nucleotides = ['A', 'T', 'C', 'G'] # 10 possible genotypes that we can identify genotypes = None genoUni = None if ploidy == 2: genotypes = [('A','A'),('A','T'),('A','C'),('A','G'),('T','T'),('T','C'),('T','G'),('C','C'),('C','G'),('G','G')] genoUni = {('A','A'):'AA',('A','T'):'AT',('A','C'):'AC',('A','G'):'AG', ('T','T'):'TT',('T','C'):'CT',('T','G'):'GT',('C','C'):'CC', ('C','G'):'CG',('G','G'):'GG'} else: genotypes = ['A', 'T', 'C', 'G'] genoUni = {'A':'A', 'T':'T', 'C':'C', 'G':'G'} rangelengenotypes = range(len(genotypes)) GT = genotypes # alias invGT = None if ploidy == 2: invGT = dict( map(lambda i: ((GT[i][0],GT[i][1]),i) , rangelengenotypes) ) invGT2 = dict( map(lambda i: ((GT[i][1],GT[i][0]),i) , rangelengenotypes) ) invGT.update(invGT2) # ========================================================================= # Parameters # ========================================================================= # P(G_xy(l)) = pr. drawing genome at position l with genotype xy # -------------------------------------------------------------- # p = log(1/10.) # PG = [p for i in range(lengenotypes)] # uniform prior # 0 = homozygous reference, 1 = het, 2 = homozygous derived if prior == 'resequence': # heterozygous different is rarest # h0, t1, h2, t2, where P(h2) = P(t1) and P(t2) == P(t1)^2 PGvals = [1-theta-theta**2, theta/2., theta/2., theta**2] elif prior == 'resequence-het': # h0, t1, h2, t2, where P(h2) = P(t2) PGvals = [1-theta-theta**2, theta, (theta**2)/2., (theta**2)/2.] elif prior == 'maq': # Durbin/maq model: Pr{XX} = Pr{YY} > Pr{XY} # h0, t1, h2, t2 PGvals = [(1-theta-theta**2)/2., theta, (1-theta-theta**2)/2., theta**2] elif prior == 'uniform': PGvals = [.25,.25,.25,.25] elif prior == 'haploid': # h0, t1, h2, t2, where P(t1) and P(t2) = 0 PGvals = [1-theta, theta] else: sys.exit('snpper: Do not understand prior argument: %s'%(prior)) PGlamb = None def PGlamb(i,ref): if genotypes[i][0]==ref and genotypes[i][1]==ref: return PGvals[0] elif (genotypes[i][0]==ref and genotypes[i][1]!=ref) or (genotypes[i][0]!=ref and genotypes[i][1]==ref): return PGvals[1] elif genotypes[i][0]==genotypes[i][1] and genotypes[i][0]!=ref: return PGvals[2] elif genotypes[i][0]!=genotypes[i][1] and genotypes[i][0]!=ref and genotypes[i][1]!=ref: return PGvals[3] if ploidy == 1: # maps a genotype given reference to 0,1 = same as ref, or different def PGlamb(i,ref): if genotypes[i]==ref: return PGvals[0] else: return PGvals[1] # make into a lookup dictionary PG = {} for i in rangelengenotypes: for nt in nucleotides: PG[i,nt] = PGlamb(i,nt) thetastr = ', Theta = %s'%(theta) if prior == 'uniform': thetastr = '' print ' - Using global prior probabilities: Prior model = %s%s'%(prior, thetastr) # ========================================================================= # Precompute nt probability scores # ========================================================================= # PriS = pr. 
sequencing read ri from template sequence S # -------------------------------------------------------------- maxphred = 40 # generalize this for all read lengths - 0 to 200 binCoef = [[nchoosek(rl,d) for d in range(rl+1)] for rl in xrange(maxreadlen+1)] # call: binomialP(readlength)(mismatches, Perror) # tune between read/position-specific error vs a global/position specific error binomialP = lambda rl: lambda d,p: binCoef[rl][d] * p**d * (1-p)**(rl-d) expError = 10**(-expPhred/10.) PriSbin = [[Lambda * binomialP(rl)(d, expError) for d in range(rl+1)] for rl in xrange(maxreadlen+1)] # index by [rl][offset] offsetRange = [map(lambda offset: range(offset)+range(offset+1, rl), xrange(maxreadlen+1)) for rl in xrange(maxreadlen+1)] ntPriS = None opLambda = 1. - Lambda # binomial + quality model if Lambda < 1.0: print ' - Using tuned binomial+quality likelihood model with lambda=%s, global error=%s'%(Lambda, expPhred) def ntPriS(S, R, Q, offset, rl): # localize variables for speed oR = offsetRange Pb = PriSbin lA2P = A2P linvA2P = invA2P nucs = nucleotides d = 0 # number of mismatches in the alignment p = 1 # read/position specific likelihood # avoid the offset position for i in oR[rl][offset]: p *= (S[i] == R[i] and lA2P[Q[i]]) or linvA2P[Q[i]] d += int(S[i] != R[i]) # complete with offset position return dict( map(lambda nt: (nt, Pb[rl][d+int(S[offset]!=nt)] + opLambda*p*(S[offset]==nt and lA2P[Q[offset]] or linvA2P[Q[offset]])), nucs) ) else: print ' - Using binomial only likelihood model with global error=%s'%(expPhred) # binomial only model when lambda = 1 # this will be faster def ntPriS(S, R, Q, offset, rl): # localize variables for speed oR = offsetRange Pb = PriSbin nucs = nucleotides d = 0 # number of mismatches in the alignment # avoid the offset position # for i in range(offset)+range(offset+1,rl): for i in oR[rl][offset]: d += int(S[i] != R[i]) # complete with offset position return dict( map(lambda nt: (nt, Pb[rl][d+int(S[offset]!=nt)]), nucs) ) # DONE CHOOSING METHOD # PriSj = pr. that Sj was the template out of all SGri templates # -------------------------------------------------------------- maximumcoverage = 100001 # this will crash if any locus has greater read depth PriSj = [nastr]+map(lambda n: 1/float(n), range(1,maximumcoverage)) # uniform, precomputed # indicator variable that returns 1 if genotype is homozygous and 2 if heterozygous # used because we are only considering the 10 unique genotypes I2xHet = {} if ploidy == 1: for X in genotypes: I2xHet[X] = 1. else: for X,Y in genotypes: if X==Y: I2xHet[(X,Y)] = 1. else: I2xHet[(X,Y)] = 2. # control plot for phred to Pvalue transformation # lst = [1-1/10**(x/10.) for x in range(34) # plot.scatter(zip(range(34),lst), style='linespoints', xlabel='phred', ylabel='1-error rate') # Set up bounds for subset of genome to process # ========================================================================= GRNA = {} if len(bounds.keys()): flg = 0 print ' - We have genotyping bounds...' 
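# Sketch of the bounds structure consumed below (hypothetical chromosome name and coordinates):
#   bounds = {'chrom01': [(0, 1000, '+'), (5000, 'NA', '+')]}
# i.e. each chromosome maps to (low, high, flag) triples; flag '+' keeps a region (or a single
# position when high is NA), while flag '-' masks the region with Ns so it is skipped during genotyping.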
null = [nastr, 'NA'] for tmpk in GR.keys(): master = GR[tmpk]['seq'][:] SEEN = False for k in [tmpk, tmpk.lower(), tmpk.upper()]: if k in bounds and not SEEN: SEEN = True lastpos = 0 # may be many positions of interest for low,high,flag in sorted(bounds[k]): if flag == '+': # remove positions not specified in a range if low not in null and high not in null: # we have a range tmps = master tmp = ''.join(map(lambda x: 'N', tmps[0:low])) tmp += tmps[low:high+1] tmp += ''.join(map(lambda x: 'N', tmps[high+1:len(tmps)])) master = tmp elif low not in null: # we have a single position of interest # since we've sorted these positions, # erase between lastpos and low tmp = master[0:lastpos+1] tmp += ''.join(map(lambda x: 'N', master[lastpos+1:low])) tmp += master[low] tmp += master[low+1:len(master)] master = tmp lastpos = low # else just leave the entire chromosome intact elif flag == '-': # set ranges to NA if low not in null and high not in null: tmps = master tmp = tmps[0:low] tmp += ''.join(map(lambda x: 'N', tmps[low:high+1])) tmp += tmps[high+1:len(tmps)] master = tmp # now I can finish erasing the end if lastpos: master = master[0:lastpos] + ''.join(map(lambda x: 'N', master[lastpos:len(master)])) # write back GRNA[tmpk] = {'id':tmpk, 'seq':master} flg += len(master) else: GRNA = GR # ========================================================================= # Initialize variables for likelihood computation # ========================================================================= totalLoci = 0 # count number of reference positions passed numHypotheses = 0 # count up number of hypotheses we make outfh = None if outfile: outfh = open(outfile, 'w') alloutfh = None if saveall: alloutfh = open(saveall, 'w') # blank template list for likelihood calculations below (copy from this) LLg0 = [0 for i in genotypes] # if doing likelihood sums # now proceed to genotype each reference genomic position chromosomesOfInterest = sorted(GRNA.keys()) nchroms = len(chromosomesOfInterest) # split chromosomes into LIPieces loading groups chromgroups = [] copydir = chromosomesOfInterest[:] if LIPieces > nchroms: LIPieces = nchroms itemspergroup = math.ceil(nchroms / float(LIPieces)) while len(copydir): currgrp = [] while len(currgrp) < itemspergroup and len(copydir): currgrp += [copydir.pop(0)] chromgroups += [currgrp] globRu = {'PE':{}, 'SE':{}} RdxChr = {}; otherhits = {} lipct = 0 for chromosomesOfInterest in chromgroups: lipct += 1 # load the relevant reads for this chromosome group # ------------------------------------------------- PESEs = ['SE', 'PE'] if not USESINGLETONS: PESEs = ['PE'] # unique reads for pese in PESEs: pipeit(' - Loading unique %s reads | loading group %s/%s...'%(pese, lipct, LIPieces)) globRu[pese] = loadUniqueReads(readfiles[pese+'uni'], chromosomesOfInterest, bounds=bounds) pipeit('Done.',1) # degenerate reads RdxChr, otherhits = loadDegReads(readfiles, chromosomesOfInterest, bounds=bounds) # process the reads for chrom in chromosomesOfInterest: GRseq = GR[chrom]['seq'] # want actual sequence for proper derror calculations chromlen = len(GRseq) # clear read pileup and marked list every new chromosome SGr.clear() # stores read pileup - this is a global variable prevGi = -maxreadlen # remember the last position Gi we visited # ITERATE OVER REFERENCE GENOMIC POSITIONS # ---------------------------------------- for Gi in xrange(chromlen): totalLoci += 1 # is this position masked due to user-specified bounds? 
if GRNA[chrom]['seq'][Gi] == 'N': continue # can't resequence without a reference ref = GRseq[Gi] # actual nucleotide in reference genome at position Gi if Gi % 10000 == 0: pipeit(' %s %s/%s = %s (Hypos=%s | %.2f'%(chrom, Gi,chromlen, ref, numHypotheses, 100*(totalLoci/float(flg)))+'% complete)',1) # GET READ PILEUP FOR THIS POSITION # --------------------------------------------------------------- # all I really need to do here is delete entries that no longer overlap and add new ones # add entries from new positions that do overlap for pos in xrange(max(0, (Gi-maxreadlen)+1, prevGi+1), min(Gi+1, chromlen)): # having group-indexed the reads by starting position we can do this # each key of SGr is the read id, returning a list of pos,seq,phred triples # FETCH UNIQUE READS OVERLAPPING THIS POSITION for pese in PESEs: if pos not in globRu[pese][chrom]: continue # iterate over reads starting at this position for (epos, eseq, equal, ehitid, esrl) in globRu[pese][chrom][pos]: SGr[ehitid] = [(epos, decodeSeq(eseq), decodeQual(equal), esrl)] # FETCH DEGENERATE READS OVERLAPPING THIS POSITION for pese in PESEs: if pos not in RdxChr[pese][chrom]: continue # iterate over reads starting at this position for (epos, eseq, equal, ehitid, esrl) in RdxChr[pese][chrom][pos]: SGr[ehitid] = [(epos, decodeSeq(eseq), decodeQual(equal), esrl)] # add conditional probs for other alignments for (opos, oseq, oqual, ohitid, osrl),ochrom in otherhits[ehitid]: SGr[ehitid] += [(opos, ochrom, decodeSeq(oseq), decodeQual(oqual))] # update previous position we visited prevGi = Gi # --------------------------------------------------------------- # DONE OBTAINING PILEUP FOR THIS POSITION # ASSESS THE GENOTYPE AT THIS POSITION # ------------------------------------ # Compute product likelihood of read pileup across genotypes LL = LLg0[:] # read-sum log likelihoods AF = {'A':0, 'T':0, 'C':0, 'G':0, 'N':0} # allele frequency distribution multiplicity = 0 # number of reads that are degenerate (have >1 alignment; some k < n) totalmultiplicity = [] # total number of degenerate alignments (d-n) # mmpr = [] # number of alignment mismatches per read marked = [] # list of readids marked to be deleted for readid,alignments in SGr.iteritems(): # if we don't overlap, then discard and continue # first alignment overlaps current position on chrom (pos,seq,qual,curRL) = alignments[0] offset = Gi-pos # what is nt of interest wrt this read? # curRL = maxreadlen # fixed length reads # curRL = len(seq) # this read's read length # then this read doesn't actually overlap current locus if offset >= curRL: marked += [readid]; continue # need to know the size of SG(l), the k-bounded string set # for the resequenced genome SG at position l # that is, total number of degenerate alignments + alignment at this position numTemplates = len(alignments) # = d+1, where d is multiplicity for this read AF[seq[offset]] += 1 # update allele counts # now have d alignments, which are all degenerate multiplicity += (numTemplates>1 and 1) or 0 # are there any degenerate alignments? totalmultiplicity += [numTemplates-1] # add d = len(alignments) # likelihood = (1/d+1)P(ri|Gl=x) + (1/d+1)Pdeg1 + ... + (1/d+1)Pdegd # = (1/d+1)[P(ri|Gl=x) + Pdeg1 + ... 
+ Pdegd] # = (1/d+1)[P(ri|Gl=x) + sum_d(Pdeg)] # ~ (1/d+1)P(ri|Gl=x) + (1/d+1)d x Pdeg # so we are just computing a weighted average # now the 1/d+1 term is the probability of one genomic segment # serving as the template for sequencing: P(s | G) # here this is uniformly distributed across reads # below is an implementation where P(s | G) ~ P(s | ri, G) # where we solve in a Bayesian fashion: # P(s | ri, G) = P(ri | s, G)P(s,G) / Sum P(ri | s, G)P(s,G) # with uniform prior this reduces to # P(s | ri, G) = P(ri | s, G) / Sum P(ri | s, G) # which is a weighting based on relative strength of the read aligning to # that genomic region # basically this says that for degenerate reads, give the most weight # to the location with fewer mismatches or higher quality # if ploidy = 2, then really have 2d + 2 alignments for this read Pcond = ploidy == 1 and PriSj[numTemplates] or PriSj[2*numTemplates] # Note, we could formulate Pcond in a more read-specific manner... # compute the likelihood for this read given the offset position Prl = ntPriS(seq,GRseq[pos:pos+curRL],qual,offset,curRL) # sum up the degenerate probability mass: info = (opos, ochrom, seq, oqual) # consider variant reference genotypes at degenerate positions - same offset as reference # Pdeg = sum( map(lambda info: max(ntPriS(info[2],GR[info[1]]['seq'][info[0]:info[0]+len(info[2])],info[3],offset).values()), alignments[1:]) )*Pcond Pdeg = sum( map(lambda info: max(ntPriS(info[2],GR[info[1]]['seq'][info[0]:info[0]+curRL],info[3],offset,curRL).values()), alignments[1:]) )*Pcond twoPdeg = 2*Pdeg # saves on computation time LL = ploidy == 1 and map(lambda g: LL[g] + log(Pcond*Prl[GT[g]] + Pdeg), rangelengenotypes) or \ map(lambda g: LL[g] + log(Pcond*(Prl[GT[g][0]] + Prl[GT[g][1]]) + twoPdeg), rangelengenotypes) # done loop over reads map(delete,marked) # delete marked reads # what is final read depth at position Gi in resequenced genome? depth = len(SGr) # don't print out info if position has no depth if depth < mindepth: continue # need to scale down the likelihoods so that natural scale values don't become 0 # we are currently on log scale so values are large negative exponents if sum(LL) < 1e-250: # that is some really small value # to avoid numerical underflow due to exponentiation of large negative LL values # need to find a positive constant C such that math.e^LL+C is computable, eg < 1000 # precursor step is to do blind rescaling to bring values near computable range # if you don't do this, you will inevitably set many positions values to equality # whereas they could be quantified just fine maxll = max(LL) while maxll < -200: c = -maxll/4. 
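# Illustrative trace of this rescaling (hypothetical likelihoods): if max(LL) is -800 the shift c
# is +200, moving max(LL) to -600; repeated passes give -450, -337.5, -253.1, then -189.8, at which
# point max(LL) > -200 and the values are safe to exponentiate after the final offset below.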
# this can't become positive LL = map(lambda x: x+c, LL) # print >> sys.stderr, 'Step:', lik maxll = max(LL) badvalue = -100000000000000000000000000000000 # token indicator of too small value LL = map(lambda x: (x < -250 and badvalue) or x, LL) # confirm that not all have badvalue if len(filter(lambda x: x!=badvalue,LL)) == 0: print >> sys.stderr, 'Warning: All likelihoods are 0 at position', Gi # then all likelihoods have the same value LL = map(lambda x: -250, LL) else: idx = argmax(LL) # get the largest likelihood candC = -LL[idx] - 100 # makes the max possible log likelihood = -100: 10**-100 = 1e-100 # all other liklihoods will be smaller than this LL = map(lambda x: (x == badvalue and -250) or (x+candC), LL) # DONE CORRECTION # Return likelihoods to natural scale LL = map(lambda x: 10**x, LL) # Allele frequency # ---------------- AFstr = ','.join( map(lambda nt: nt+':'+str(round(AF[nt]/float(depth),2)), nucleotides) ) # Posterior probabilities # ---------------------------------------------- # we now have a length-10 vector of likelihood values # for all reads at Gi given each genotype # calculate the posterior probability of each possible genotype # given the read set # we must calculate the PGxyR for all possible genomic string configurations C(SGxy(l)) # assuming genotype at position l is xy # so sum up prob over the C(SGxy(l)) for each genotype xy # so basic assumption is G = GR so that C(SGxy(l)) is simply the k-bounded # set of the reference: C(SGxy(l)) = C(SGRxy(l)) # this way there is no summing involved # Multiplicity ratio: d-n / n (or Ndeg / N), ranging from 0 to inf # aka the average number of additional alignments per read / locus # Sum(d_read) / Sum(reads) per locus # -------------------------------------------------- udalpha = round(harmonicMean(totalmultiplicity), 3) PGxyR = None try: # numerator: P(r1...rn | Gxy(Gi)) x P(Gxy(Gi)) if ploidy == 1: PGxyR = map(lambda g: LL[g]*PG[g,ref], rangelengenotypes) else: dnt = {'A':None,'T':None,'C':None,'G':None}; del dnt[ref] dL = max(map(lambda alt: LL[invGT[(ref,alt)]], dnt.keys())) homweight = max(1e-10, 1.-(udalpha+1)*theta) # so minimum P(RR) = e-10 hetweight = udalpha*theta PGxyR = map(lambda g: GT[g][0]==ref and GT[g][0]==GT[g][1] and \ LL[g]*homweight + dL*hetweight or \ LL[g]*PG[g,ref], rangelengenotypes) # denominator: Sum_xy{ P(r1...rn | Gxy(Gi)) x P(Gxy(Gi)) } Ptotal = float(sum(PGxyR)) except OverflowError: print 'snpper(overflow error):\nLL=',LL, '\nprior=', [PG[i,ref] for i in rangelengenotypes] continue # Final posterior probabilities for all genotypes at position Gi # -------------------------------------------------------------- # double up the heterozygous probabilities, e.g. P[] PGxyR = map(lambda i: I2xHet[genotypes[i]]*PGxyR[i], rangelengenotypes) Ptotal = float(sum(PGxyR)) PP = map(lambda x: x/Ptotal, PGxyR) # Determine MAP genotype and posterior # ------------------------------------ # take argmax, but multiple genotypes may have similar or identical PP. # This would happen if the pileup is highly degenerate. 
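# For example (hypothetical posteriors): PP = [0.95, 0.04, 0.01, ...] gives MAP = 0.95 and
# Q = int(-10*log10(1 - 0.95)) = 13, which is the nominal significance cutoff applied below;
# MAP = 0.99 would give Q = 20.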
idx = argmax(PP) geno = genoUni[genotypes[idx]] MAP = PP[idx]>1 and 1 or PP[idx] # force to 1 Q = nastr try: Q = int(-10*log(1-MAP)) # convert to phred score # this occurs if MAP = 1, so force to some arbitrary minimum pvalue except ValueError: Q = lowestQ # this will be Q = 160 # occurs if MAP is < about 1e-16 except OverflowError: Q = lowestQ # this will be Q = 160 # Record statistics info = [chrom, str(Gi), ref, geno, str(Q), str(depth), str(multiplicity), str(sum(totalmultiplicity)), '%.3f'%(udalpha), 'N', AFstr] # fixed haploid/diploid issue genoref = (ploidy > 1 and iupac2nts(ref)) or ref if MAP == 0: info[3] = genoref # default to ancestor # print 'genoref', genoref, 'ref', ref, 'geno', geno # Assess nominal significance (P<0.05 and writes on the fly) # --------------------------- numHypotheses += 1 # we are evaluating a hypothesis if Q>=13 and genoref != geno: info[9] = 'Y' info = '\t'.join(info) if verbose: print info # print >>outfh, info # nominal file - deprecated else: info = '\t'.join(info) # print info print >>alloutfh, info # if printing all base calls # DONE LOOP OVER POSITIONS # DONE LOOP OVER CHROMOSOMES # DONE GENOME if outfile: outfh.close() if saveall: alloutfh.close() return numHypotheses # ----------------------------------------------------------------- # ----------------------------------------------------------------- # END LOCAL METHODS # CLI VARIABLES # ------------------------------------------------------------------- outdir = slash(os.getcwd()) btdir = '' btmapdir = ''; SETMAPDIR = False MAPCUT = False findir = '' mapfile = '' bfaref = '' zkey = 'sniper_results' # Default folder name for processed samples ########################### SNIPER PARAMETERS ########################### ######################################################################### # PART I: ALIGNMENT PARAMETERS # ---------------------------- RUNBOWTIE = False # align reads to reference genome using bowtie REALIGN = False # realign reads nprocs = 4 # number of processors per computer for bowtie assembly alignmentPolicy = 'bestm' # basic, best, all, bestk, bestm, maq-bestm, bestm-stratum, best-stratum mismatches = {'SE':1, 'PE':1} # number of mismatches for bowtie alignment (-v mismatches[x]) maxnumalign = 25 # maximum number of alignments a read could have, # subject to -mm mismathces (ONLY FOR policies bestm and bestk) # set this to the expected read coverage readfileformat = 'fastq' # or fasta COUNTREADS = False # count number of mapped reads from bowtie output qualstr = '--phred64-quals' # default is 64-base phred scores for Illumina GA 1.3+ trim5 = '' # trim 5' end of reads by trim5 bases trim3 = '' # trim 3' end of reads by trim3 bases COLORSPACE = False # use bowtie colorspace # mate pair insertion size bounds minins = 100; maxins = 400 # default lower and upper bounds on insert size GETINSERTSIZE = False # estimate PE insert size distributions insstat = '.99' # take as insert bounds the inner insstat percentiles # PART II: READ FILE ORGANIZATION # ------------------------------- ORGANIZEREADS = False # organize bowtie ouput into 4 read files per sampleID: unique/deg/PE/SE REORGANIZEREADS = False # redo this read organization KEEPBOWTIE = False # retain the bowtie output in addition to organized map files... 
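# For orientation only: under the 'bestm' policy the settings above correspond roughly to a
# paired-end bowtie call of the form (hypothetical index and read file names; the real command
# is assembled elsewhere in this script):
#   bowtie -v 1 -a -m 25 --best --phred64-quals -I 100 -X 400 ref_index -1 s_5_1.fastq -2 s_5_2.fastq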
# PART III: SNP CALLING # --------------------- SNPCALLING = False # whether or not to perform genotyping REDOSNPCALLING = False # whether to redo SNP calling on a sampleID which has already been typed REDOFINALSNPCALLING = False # correct for multiple testing to return final significant snps UNIQ = False # genotype using uniqely aligned reads only (--bestm -m 1) BEST = False # genotype using best-guess aligned reads only (--bestk -m 1) # if UNIQ = BEST = False, align using --bestk/m -m d USESINGLETONS = True # genotype using both PE and SE reads boundsfile = '' # file containing chrom\tlow\thigh rows for restricting genotyping singlesampleid = [] # name of single sampleID to genotype LOWSID = None; HIGHSID = None # if user wants to bound which samples are genotyped in a map file LIPieces = 25 # Break loading of unique reads into this number of pieces # greatly reduces memory requirements nthreads = 1 # number of active threads - process different sampleIDs to the same reference # unfortunately, python can't distribute over separate processor cores, # so setting > 1 is rather fruitless maxreadlen = 150 # currently global hardwired string length of all reads Lambda = 0.67 # weighting factor; larger values up-weight global # binomial pr. vs. read-specific phred prob. expPhred = 35 # Expected per-nucleotide sequencing error for global binomial likelihood model ESTIMATEGLOBALERROR = False # estimates the value for expPhred prior = 'resequence' # resequence, maq, uniform, haploid theta = 0.001 # expected human heterozygosity colorspacetheta = (3/2.)*theta # overall haploid divergence for sample--reference genome for bowtie colorspace conversion 0.000000198 # Default range of Q values for significant SNP identification QvalueRange = [0, 5, 13, 20, 40, 80, 100, 160] # SNP POSTPROCESSING SETTINGS # --------------------------- MAXALPHA = None # don't call any SNPs that have alpha > 10 MINALPHA = 0.0 POSTFILTER = False # similar to maxalpha but looks at num deg reads / num reads at locus # instead of num deg alignments vs num reads at locus DDEGCUTOFF = 0.9 # if more than 90% of depth is due to degenerate reads then can't be a SNP REDUNDANCYFILTER = None # Can optionally include a file of redunant loci to mask from SNP calling POSTPROCESSING = False # use a maq-like quality filter # allows 2 criteria to call a SNP FDEPTH = [10,50] # if depth at least x FDEG = [5,1] # if depth/degeneracy ratio at least x FQ = [13,13] # if posterior prob (as phred) at least x # OTHER PARAMETERS # ---------------- REDOALL = False # redo everything DIAGNOSTICS = False # generate summary statistics for genotype results SAVEASSEMBLY = False # save new genome using new SNP calls SKIPHIST = False specialReferenceFiles = None # Internal # READLENBYUSER = False DRYRUN = False # dryrun the pipeline SAVENOMINAL = False # save P < 0.05 SNPs while processing genome ########################################################################## ########################################################################## # CLI ARGS # ------------------------------------------------------------------- help = """ Help for sniper.py Help screen: sniper.py --help Version information: sniper.py --version Requires Unix/Mac OS X/CYGWIN with Python 2.5 or 2.6 (preferably 64-bit for memory allocation > 4GB) Standard execution: sniper.py -m -r -z Two-step execution for paired-end read data: 1. sniper.py -m -r --ins 2. 
sniper.py -m -r -z

This will align raw NGS reads, perform genotyping, and provide diagnostics and new assemblies for all samples according to the map file. Results will be stored in a new directory 'sniper_results' in your current working directory.

Standard arguments:
* -m: map file (required)
* -r: reference genome (required, fasta format)
* -o: primary directory in which to save the results directory (default: current working directory)
* --name: name of the results directory (default: sniper_results)
* -z: perform/redo all procedures (equivalent to '-a -O -s --diagnostics --saveassembly')
* -a: align reads (generate read map)
* -O: organize read map into singly-mapped and multiply-mapped reads
* -s: SNP calling
* --lip: number of separate pieces to load into memory; typically <= number of chromosomes in the reference genome (larger values are slower but use less memory)
* --diagnostics: generate text file summarizing SNP calls
* --saveassembly: create a new fasta file of the reference genome modified by significant SNP calls (NB it is recommended to use this in conjunction with '-Q' to specify the desired phred stringency. Ex: --saveassembly -Q 40 or --saveassembly -Q 20,30,40)

Important parameters:
sniper.py -m -r -z -k 2 -d 10 -e 30 --lambda 0.5 --readlength 38 --prior resequence --theta 0.01

Arguments:
* -k: maximum number of alignment mismatches (default: 1)
* -d: maximum number of alignments per read (default: 25)
* -e: global sequencing error rate (specified as a phred value; e.g. Q=30 == P<0.001) (default: 35)
* --lambda: reweight the likelihood model towards read-specific or global binomial probability (default: 0.67 towards binomial)
* --readlength: length of each sequence read (default: automatic)
* --prior: prior probability distribution (default: resequence)
  Options:
    resequence: h0:1-theta-theta^2, t1:theta/2, h2:theta/2, t2:theta^2
    resequence-het: h0:1-theta-theta^2, t1:theta, h2:theta^2/2, t2:theta^2/2
    maq: h0:(1-theta-theta^2)/2, t1:theta, h2:(1-theta-theta^2)/2, t2:theta^2
    uniform: h0:0.25, t1:0.25, h2:0.25, t2:0.25
    haploid: h0:1-theta, t1:theta
* --theta: expected divergence (heterozygous and homozygous) from reference genome (default: 0.001)

Other notable arguments:
* --peonly: only use paired-end reads for SNP identification
* --uniq: only use uniquely mapping reads for SNP identification
* --colorspace: input files are in ABI SOLiD colorspace format (suffix .csfasta and .qual); NB, currently only works with single-end reads
* --colorspacetheta: Haploid proportion of divergence between sample and reference genomes (default: 3/2 theta)
* --loadinpieces/--lip: Integer number of groups into which unique reads will be broken (default: 10). Larger values will decrease memory requirements.
* --stringency/-Q: Comma-delimited list of phred-quality scores for which to compute significance (Default: 0,5,10,13,20,30,40,50,60,70,80,90,100,110,120,130,140,150,155,160)
* --restrict: Specify a region or file containing a list of regions over which to perform genotyping (default: all)
  EX: --restrict chrom01,0,1000,+ will genotype positions 0 to 1000 of chromosome 1, inclusive
  EX: --restrict chrom01,0,1000,- will genotype all of chromosome 1 excluding positions 0 to 1000
  A boundary file should be a tab-delimited list of one or more regions similarly described, replacing commas with tabs.
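  For example, a boundary file restricting genotyping to positions 0 to 1000 of chrom01 while excluding positions 0 to 500 of chrom02 might contain the two rows:
    chrom01  0  1000  +
    chrom02  0  500   -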
* To estimate the insert size range for paired-end reads, use '--ins' (see below)
* To estimate the expected per-nucleotide sequencing error rate, use '--globalerror'
* To restrict genotyping to a subset of samples specified in the map file, use '--sampleids sid1,sid2,...,sidx'

Map file format:
The map file is a standard tab-delimited text file indicating how each raw sequence reads file should be processed by sniper. The format is organized as one row per sequenced lane and requires 7 columns of information per row, as follows:

#Run  Lane  SampleID        Alias  RefGenome             PairedEnd  Path
1     1     myfirstgenome   foo    /path/to/ref_egenome  0          /path/to/reads/run1/
2     5     mysecondgenome  bar    /path/to/ref_egenome  1          /path/to/reads/run2/

In this example, the first sample was run on lane 1 and contains single-end reads. The second sample was sequenced on lane 5 and contains paired-end reads. The standard mapper used with sniper.py is bowtie, so the specified reference is the name of the respective index file, excluding file suffix. The first line shows the headers but is treated as a comment; comments are discarded by Sniper.

Raw sequence read files should contain as a substring the label indicated in the run column and be placed in the /path/to/reads/ directory given in the Path column. For example, the first sample file may be named 's_1_sequence.txt' or 'testing1.txt'. The second sample should have 2 files since it is paired-end, named 's_5_1_sequence.txt' and 's_5_2_sequence.txt'. Colorspace files should take the format 's_1_sequence.csfasta' and 's_1_sequence.qual' for the fasta and quality files, respectively. Additional fields may be added to the map file after the required columns (e.g. concentration, date sequenced, author, etc.).

Estimating insert size distribution for paired-end data:
By default paired-end reads are mapped using an insert size range of 100 to 400 nt. Users may specify custom min and max bounds using
* --minins: Specify the minimum insert size for all samples
* --maxins: Specify the maximum insert size for all samples

Alternatively use '--ins' to estimate the empirical insert size distribution for each sample specified in the map file:
* --ins: Estimate insert size distributions for all samples specified in the map file.
* --insstat: Use the percentile pct of the insert size distribution for alignment (default: 0.99)

Note that alignments must first be generated using '-a'. A file 'PE_insert_size.txt' will be created in the specified output directory that Sniper will use by default if samples are mapped again (using '--realign/--ra').

Other CLI arguments:
* --bowtie/-btd: Specify the base directory of your bowtie installation (e.g.
--btd /usr/bin/bowtie-0.12.5) (default: '') * --keepbowtie/--kb: Do not delete the original bowtie SAM file following read organization * --policy: Specify Bowtie alignment policy (default: bestk) Options: * 'basic': 'bowtie -n -m 1 (Maq-like) * 'all': 'bowtie -v -a (end-to-k all alignments) * 'bestk': -v -k --best * 'bestm': -v -a -m --best * --peonly: utilize paired-end reads only for SNP calling * --kpe: Permit a maximum number of alignment mismatches for paired-end reads * --kse: Permit a maximum number of alignment mismatches for single-end reads * --qual33: Read qualities are on phred 33 scale instead of default phred 64 scale * --trimreads n5,n3: For alignment, consider a read substring from 1+n5...|r|-n3 * --colorspace/-C: NGS data are in ABI SOLiD colorspace * --realign/--ra: perform a new Bowtie alignment, replacing existing alignments * --reorganize/--rO: generate new organized read maps, replacing existing ones * --redosnp/--rs: redo SNP calling, replacing existing files in the snps/ subdirectory """ version = """sniper.py Version 1.0 Daniel F. Simola Laboratory of Junhyong Kim University of Pennsylvania http://kim.bio.upenn.edu/software/sniper.shtml """ nhelps = 0; helplimit = 0 args = sys.argv[:] argstr = ''.join(args) ai = 1 userformat = False while ai < len(args): arg = args[ai].strip('-').strip('--')#.lower() try: val = args[ai+1] except IndexError: val = '' if re.match('out|^o$', arg): outdir = slash(val) elif re.match('^bowtie$|^btd$', arg): btdir = slash(val) elif re.match('^map$|^m$', arg): mapfile = val elif re.match('^btmapdir$|^bmd$', arg): SETMAPDIR = True; btmapdir = val elif re.match('^mapcut$|^mc$', arg): MAPCUT = True; ai-=1 elif arg == 'fasta': readfileformat = 'fasta'; ai-=1 elif re.match('^name$', arg): zkey = val if zkey[len(zkey)-1] == '/': zkey = zkey[0:len(zkey)-1] elif re.match('^ref$|^b$|^r$', arg): bfaref = val elif re.match('policy|^pol$', arg): alignmentPolicy = val elif re.match('^mcap$|^d$', arg): maxnumalign = int(val) elif re.match('^qual33$', arg): qualstr = '--phred33-quals'; ai-=1 elif re.match('^trimreads$', arg): trim5,trim3 = map(int, val.split(',')) elif re.match('^colorspace$|^C$', arg): COLORSPACE = True; ai-=1 elif re.match('align|^a$', arg): RUNBOWTIE = True; ai-=1 elif re.match('realign|^ra$', arg): REALIGN = True; ai-=1 # elif re.match('countreads$', arg): COUNTREADS = True; ai-=1 elif re.match('usesingletons|pese|sepe|^g$', arg): USESINGLETONS = True; ai-=1 elif re.match('peonly|^pe$', arg): USESINGLETONS = False; ai-=1 elif re.match('^uniq$|^unique$', arg): # only return an alignment if it is unique in the reference genome # given specified maximum mismatches UNIQ = True; alignmentPolicy = 'bestm'; maxnumalign = 1; ai-=1 elif re.match('^best$|^bestguess$', arg): # return an alignment if it is the one with the fewest mismatches # given specified maximum mismatches BEST = True; alignmentPolicy = 'bestk'; maxnumalign = 1; ai-=1 elif arg == 'k': mismatches = {'SE':int(val), 'PE':int(val)} elif arg == 'kpe': mismatches['PE'] = int(val) elif arg == 'kse': mismatches['SE'] = int(val) elif arg == 'ins': GETINSERTSIZE = True; RUNBOWTIE = True; ai-=1 elif arg == 'insstat': insstat = str(val) elif re.match('minins', arg): minins = int(val) elif re.match('maxins', arg): maxins = int(val) # read organization elif re.match('organize|^O$', arg): ORGANIZEREADS = True; ai-=1 elif re.match('reorganize|^rO$', arg): ORGANIZEREADS = True; REORGANIZEREADS = True; KEEPBOWTIE = True; ai-=1 elif re.match('^keepbowtie$|^kb$', arg): KEEPBOWTIE = True; ai-=1 
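# Note on the parsing convention used throughout this loop: ai advances by two
# positions per iteration (one flag plus its value; see 'ai += 2' at the end of
# the loop), so boolean flags that take no value decrement ai by 1 to keep the
# scan aligned with sys.argv.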
elif arg == 'skiphist': SKIPHIST = True; ai-=1 # SNP calling elif re.match('^snp$|^s$', arg): SNPCALLING = True; ai-=1 elif re.match('^redosnp$|^rs$', arg): SNPCALLING = True; REDOSNPCALLING = True; REDOFINALSNPCALLING = True; ai-=1 # this just re-executes the significant SNP filter elif re.match('^redocorrection$|^rc$', arg): SNPCALLING = True; REDOFINALSNPCALLING = True; ai-=1 elif re.match('^redocorrectionpost$|^rcf$', arg): SNPCALLING = True; REDOFINALSNPCALLING = True; POSTPROCESSING = True; ai-=1 elif re.match('^loadinpieces$|^lip$', arg): LIPieces = int(val) # Important parameters elif re.match('readlen|^rl$', arg): maxreadlen = int(val)#; READLENBYUSER = True elif re.match('lambda|^l$', arg): Lambda = float(val) elif re.match('expphred|^e$', arg): expPhred = int(val) elif re.match('globalerror|^ge$', arg): ESTIMATEGLOBALERROR = True; ai-=1 elif re.match('prior', arg): prior = val elif re.match('theta', arg): theta = float(val) elif re.match('colorspacetheta', arg): colorspacetheta = float(val) elif re.match('^stringency$|^Q$', arg): QvalueRange = map(int, val.split(',')) # restricting genotyping for a map file with multiple samples elif re.match('^restrict|^rest$', arg): boundsfile = val elif re.match('^sampleid$|^id$', arg): singlesampleid = [val] elif re.match('^sampleids$|^ids$', arg): singlesampleid = map(lambda x: x.strip(), val.split(',')) elif re.match('^subset|^sub$', arg): LOWSID,HIGHSID = map(int,val.split(',')) # postprocessing settings elif re.match('^maxalpha$|^mxa$', arg): MAXALPHA = float(val) elif re.match('^minalpha$|^mna$', arg): MINALPHA = float(val) elif re.match('^postfilter$', arg): POSTFILTER = True; ai -=1 elif re.match('^pfcutoff$', arg): DDEGCUTOFF = float(val) elif re.match('^repfilter$|^rf$', arg): REDUNDANCYFILTER = val # other settings elif re.match('redoall|^z$', arg): REDOALL = True; ai-=1 elif re.match('^diagnostics$|^diag$', arg): DIAGNOSTICS = True; ai-=1 elif re.match('noass|^na$', arg): SAVEASSEMBLY = False; ai-=1 elif re.match('saveass|^sa$', arg): SAVEASSEMBLY = True; ai-=1 elif re.match('^threads$|^p$', arg): nthreads = int(val) elif re.match('^srefs$|^sr$', arg): specialReferenceFiles = val # for --saveassembly only elif re.match('^nominal$|^noms$', arg): SAVENOMINAL = True; ai -=1 elif re.match('^help|h$', arg.lower()): sys.exit(help) elif re.match('^version|v$', arg.lower()): sys.exit(version) else: astr = "=> The specified argument \""+arg+"\" does not parse. Try --help for more information." 
sys.exit(astr) ai += 2 # ------------------------------------------------------------------- # contingency of flags # -------------------- if REDOALL: RUNBOWTIE = REALIGN = True ORGANIZEREADS = REORGANIZEREADS = True SNPCALLING = REDOSNPCALLING = REDOFINALSNPCALLING = True DIAGNOSTICS = True SAVEASSEMBLY = True if ORGANIZEREADS and not bfaref: sys.exit('Erorr: reference fasta file must be provided with -O using --ref/-r .') # declaration of directories # -------------------------- zdir = outdir+zkey+'/' if not SETMAPDIR: btmapdir = zdir+'read_maps_all/' unmapdir = zdir+'unmapped_reads/' pmadir = zdir+'alignment_mismatches/' if UNIQ: if not SETMAPDIR: btmapdir = zdir+'read_maps_uni/' unmapdir = zdir+'unmapped_reads_uni/' pmadir = zdir+'alignment_mismatches_uni/' if BEST: if not SETMAPDIR: btmapdir = zdir+'read_maps_best/' unmapdir = zdir+'unmapped_reads_best/' pmadir = zdir+'alignment_mismatches_best/' zinsdir = zdir+'PE_insert_size/' tmpstr = 'ALL' if UNIQ: tmpstr = 'UNI' if BEST: tmpstr = 'BEST' resdir = zdir+'results-L%s,E%s,%s,%s/'%(Lambda, expPhred, USESINGLETONS and 'PESE' or 'PE', tmpstr) del tmpstr findir = resdir+'assembled_genomes/' snpdir = resdir+'snps/' phylodir = resdir+'SNPstrings/' tmpdir = resdir+'tmp/' # file suffix depending on map type ubamap = '-all' if UNIQ: ubamap = '-uniq' if BEST: ubamap = '-best' # create new directories # ---------------------- createdir(outdir) createdir(zdir) createdir(btmapdir) createdir(zinsdir) createdir(unmapdir) createdir(pmadir) # save a log file with the user's input arguments logfile = zdir+'log_file.txt' iomethod = 'w' if os.access(logfile, os.F_OK): iomethod = 'a' fh = open(logfile, iomethod) print >> fh, ' '.join(args) fh.close() # parse map file and group replicates # ---------------------------------------- mapd,runs,head = readTable(mapfile,header=0) seqRuns = {} # keyed by sampleID ct = 0 totalsamples = 0 for i in range(len(runs)): ct += 1 if MAPCUT and (ct < LOWSID or ct > HIGHSID): continue totalsamples += 1 run,lane,sampleID,alias,refGenome,paired,path = runs[i], mapd[i][0], mapd[i][1], mapd[i][2], mapd[i][3], int(mapd[i][4]), mapd[i][5] if path == '': path = '/' else: path = slash(path) # make absolute path if local directory specified if path[0] != '/' or path == '/': abspath = slash(os.getcwd()) # if this does not end at the current local path, add path if slash(abspath.split('/')[-2]) != path: abspath += path path = abspath path = slash(path) try: seqRuns[sampleID][(run,lane)] = [refGenome, paired, path] except KeyError: seqRuns[sampleID] = {(run,lane):[refGenome, paired, path]} pipeit('- Read %s samples from map file.'%(totalsamples),1) # --------------------------------------------------------- sortseqruns = seqRuns.keys() sortseqruns.sort() if not MAPCUT: keeps = dict(map(lambda x: (x,None), sortseqruns[LOWSID:HIGHSID])) for sid in sortseqruns: if sid not in keeps: del seqRuns[sid] if len(singlesampleid): sr2 = {} for sid in singlesampleid: if sid in seqRuns: bak = seqRuns[sid] sr2[sid] = bak else: print 'Warning: sample %s not found in map file.'%(sid) seqRuns = sr2 sortseqruns = seqRuns.keys() sortseqruns.sort() if LOWSID or HIGHSID: print '- Processing samples %s'%(sortseqruns[LOWSID:HIGHSID]) else: print '- Processing %s sampleIDs'%(len(sortseqruns)) # --------------------------------------------------------- # load bounds file boundslabel = '' seqbounds = {} if boundsfile and os.access(boundsfile, os.F_OK): d,r,c = readTable(boundsfile, header=False) for i in range(len(d)): tmp = [] try: tmp += [intna(d[i][0])] 
except ValueError: tmp += [nastr] try: tmp += [intna(d[i][1])] except ValueError: tmp += [nastr] tmp += [d[i][2]] # flag - + or - try: seqbounds[r[i]] += [tmp] except KeyError: seqbounds[r[i]] = [tmp] parts = boundsfile.split('/') crap, boundslabel, crap = re.split('(.+)\..+$', parts[len(parts)-1]) boundslabel = '.'+boundslabel elif boundsfile: # this may also be a string of a particular bound: chr01, 500, 1000 splt = boundsfile.split(',') if len(splt) == 1: loc = splt[0] seqbounds[loc] = [(nastr, nastr, '+')] # boundslabel = '.%s,%s,%s,%s'%(loc,nastr,nastr,'+') boundslabel = '.%s'%(loc) elif len(splt) == 3: loc, low, high = map(lambda x: x.strip(' '), splt) seqbounds[loc] = [(intna(low), intna(high), '+')] boundslabel = '.%s,%s,%s,%s'%(loc,low,high,'+') elif len(splt) == 4: loc, low, high,flag = map(lambda x: x.strip(' '), splt) seqbounds[loc] = [(intna(low), intna(high), flag)] boundslabel = '.%s,%s,%s,%s'%(loc,low,high,flag) else: sys.exit('Restriction flag not specified properly: '%(boundsfile)) # DONE INITIALIZATION OF VARIABLES # ========================================================= # Part I: Align reads to reference genome # ========================================================= parts = mapfile.split('/') peinsfile = zinsdir+'PE_insertsize_table-%s'%(parts[len(parts)-1]) peinsize = {} # dict specifying PE insert size statistics, keyed by sampleID if os.access(peinsfile, os.F_OK): d,r,c = readTable(peinsfile, header=True) for i in range(len(d)): peinsize[r[i]] = {} for j in range(len(c)): peinsize[r[i]][c[j]] = (d[i][j] != '' and floatna(d[i][j])) or nastr if RUNBOWTIE or REALIGN: print '\nPart I: GENERATING READ MAPS USING BOWTIE...' # call bowtie on all samples in mapfile for sampleID in sorted(seqRuns.keys()): # set parm str each time here because insert size may be sample specific try: minins1 = peinsize[sampleID]['min'+insstat] maxins1 = peinsize[sampleID]['max'+insstat] if minins1 not in [nastr, '']: minins = minins1 if maxins1 not in [nastr, '']: maxins = maxins1 except KeyError: pass print '- ID %s: if paired-end, min/max insert size = [%s,%s]'%(sampleID, minins, maxins) parmstr = '-t --pairtries 300 %s '%(qualstr) sinparmstr = '-t %s '%(qualstr) if readfileformat == 'fasta': parmstr += ' -f '; sinparmstr += ' -f ' if trim5: astr = '--trim5 %s '%(trim5) parmstr += astr sinparmstr += astr if trim3: astr = '--trim3 %s '%(trim3) parmstr += astr sinparmstr += astr if alignmentPolicy == 'basic': # for the maq-like policy parmstr += '--minins %s --maxins %s -n %s -m 1'%(minins, maxins, mismatches['PE']) sinparmstr += '-n %s -m 1'%(mismatches['SE']) # for singleton samples elif alignmentPolicy == 'all': # end-to-k policy (-v) with all valid alignments reported (-a) parmstr += '--minins %s --maxins %s -v %s -a'%(minins, maxins, mismatches['PE']) sinparmstr += '-v %s -a'%(mismatches['SE']) # for singleton samples elif alignmentPolicy == 'bestk': parmstr += '--minins %s --maxins %s -v %s -k %s --best'%(minins, maxins, mismatches['PE'], maxnumalign) sinparmstr += '-v %s -k %s --best'%(mismatches['SE'], maxnumalign) # for singleton samples elif alignmentPolicy == 'bestm': parmstr += '--minins %s --maxins %s -v %s -a -m %s --best'%(minins, maxins, mismatches['PE'], maxnumalign) sinparmstr += '-v %s -a -m %s --best'%(mismatches['SE'], maxnumalign) # for singleton samples elif alignmentPolicy == 'maq-bestm': parmstr += '--minins %s --maxins %s -n %s -a -m %s --best'%(minins, maxins, mismatches['PE'], maxnumalign) sinparmstr += '-n %s -a -m %s --best'%(mismatches['SE'], maxnumalign) # 
for singleton samples elif alignmentPolicy == 'bestm-stratum': parmstr += '--minins %s --maxins %s -v %s -a -m %s --best --strata'%(minins, maxins, mismatches['PE'], maxnumalign) sinparmstr += '-v %s -a -m %s --best --strata'%(mismatches['SE'], maxnumalign) # for singleton samples elif alignmentPolicy == 'best-stratum': parmstr += '--minins %s --maxins %s -v %s -a --best --strata'%(minins, maxins, mismatches['PE']) sinparmstr += '-v %s -a --best --strata'%(mismatches['SE']) # for singleton samples # output in sam format parmstr += ' --sam'; sinparmstr += ' --sam' for run,lane in seqRuns[sampleID].keys(): refGenome,paired,path = seqRuns[sampleID][(run,lane)] UID = '%s-%s-%s'%(sampleID,run,lane) # find the file that is indicated in the map file fastqFiles = sorted(getFiles(path, include=lane)) # print 'TEST fastqfiles', fastqFiles # sys.exit() if paired: # remove extraneous files that do not have the _1 or _2 in them fastqFiles = filter(lambda x: ('_1' in x or '_2' in x) and True, fastqFiles) if len(fastqFiles) > 2: print >> sys.stderr, 'Warning: Found more than 2 matching read files. Using default s_lane_x_sequence.txt format.' fastqFiles = [path+'s_%s_1_sequence.txt'%(lane), path+'s_%s_2_sequence.txt'%(lane)] elif len(fastqFiles) < 2: print >> sys.stderr, 'Check read file names or path. Did not find 2 files for sample %s in path "%s": %s'%(lane, path, fastqFiles) call = '%sbowtie %s -p %s "%s" -1 "%s" -2 "%s" "%s%s.map" --un "%s%s_unfq.fq"'%(btdir,parmstr,nprocs, refGenome, fastqFiles[0], fastqFiles[1], btmapdir, UID, btmapdir, UID) # call = '%sbowtie %s -p %s "%s" -1 "%ss_%s_1_sequence.txt" -2 "%ss_%s_2_sequence.txt" "%s%s.map" --un "%s%s_unfq.fq"'%(btdir,parmstr,nprocs, refGenome,path,lane,path,lane,btmapdir,UID,btmapdir,UID) print '- Assembling %s paired end data...'%(UID) print call if not DRYRUN and (not os.access(btmapdir+UID+'.map', os.F_OK) or REALIGN): os.system(call) print # set this up to process the remaining singletons as well, and then merge the output files... # outfile is _unfq.fq if 1: # USESINGLETONS print UID, '- Assembling PE left-over singleton data...' for lane in [1,2]: call2 = '%sbowtie %s -p %s "%s" "%s" "%s" --un "%s"'%(btdir, parmstr, nprocs, refGenome, btmapdir+UID+'_unfq_%s.fq'%(lane), btmapdir+UID+'_singles_%s.map'%(lane), unmapdir+UID+'_unmapped_%s.fq'%(lane)) print call2 if not DRYRUN and (not os.access(btmapdir+UID+'_singles_%s.map'%(lane), os.F_OK) or REALIGN): os.system(call2) print # remove the input fq file, contents of which are in singles_map and unmapped os.system('rm -f "%s"'%(btmapdir+UID+'_unfq_%s.fq'%(lane))) else: # rename the --un files from the PE run as unmapped and move them into the unmapped folder for lane in [1,2]: call = 'mv %s %s'%(btmapdir+UID+'_unfq_%s.fq'%(lane), unmapdir+UID+'_unmapped_%s.fq'%(lane)) os.system(call) else: if not COLORSPACE: if len(fastqFiles) > 2: print >> sys.stderr, 'Warning: Found more than 2 matching read files. Using default s_lane_x_sequence.txt format.' fastqFiles = [path+'s_%s_1_sequence.txt'%(lane), path+'s_%s_2_sequence.txt'%(lane)] elif len(fastqFiles) != 1: print >> sys.stderr, 'Check read file names or path. Did not find 1 file for sample %s in dir "%s": %s'%(lane, path, fastqFiles) # len(fastqFiles) != 1: # sys.exit('Check read file names. 
Did not find 1 file for sample %s: %s'%(lane, fastqFiles)) call = '%sbowtie %s -p %s "%s" "%s" "%s%s.map" --un "%s%s_unmapped.fq"'%(btdir, sinparmstr, nprocs, refGenome, fastqFiles[0], btmapdir, UID, unmapdir, UID) # call = '%sbowtie %s -p %s "%s" "%ss_%s_sequence.txt" "%s%s.map" --un "%s%s_unmapped.fq"'%(btdir, sinparmstr, nprocs, refGenome, path, lane, btmapdir, UID, unmapdir, UID) if COLORSPACE: if len(fastqFiles) != 2: print >> sys.stderr, 'Check read file names or path. Did not find 2 files (.csfasta and .qual) for sample %s in file "%s": %s'%(lane,path, fastqFiles) # call = '%sbowtie %s -C -p %s "%s" -f "%ss_%s_sequence.csfasta" -Q "%ss_%s_sequence.qual" "%s%s.map" --un "%s%s_unmapped.fq" --snpfrac %s'%(btdir, sinparmstr, nprocs, refGenome, path, lane, path,lane, btmapdir, UID, unmapdir, UID, colorspacetheta) call = '%sbowtie %s -C -p %s "%s" -f "%s" -Q "%s" "%s%s.map" --un "%s%s_unmapped.fq" --snpfrac %s'%(btdir, sinparmstr, nprocs, refGenome, fastqFiles[0], fastqFiles[1], btmapdir, UID, unmapdir, UID, colorspacetheta) print '- Assembling original singleton data for %s...'%(UID) print call if not DRYRUN and (not os.access(btmapdir+UID+'.map', os.F_OK) or REALIGN): os.system(call) print print '=> DONE', sampleID, '\n' print 'DONE PART I: READ ALIGNMENT FOR ALL SAMPLES' print # ========================================================= # Part IB: Assay raw read coverage / Insert size stats # ========================================================= # if COUNTREADS: # print '- Loading reference genome...' # G = readFasta(bfaref) # # chromosome total lengths (bp) # chromlengths = dict( map(lambda k: (k, len(G[k]['seq'])), G) ) # master_chroms = chromlengths.keys() # # readbphead = ['SampleID', 'Run', 'Lane', 'PE', 'Reads', 'Bases', 'Coverage'] # readbptab = [] # # print 'Computing number of reads for samples...' # for sampleID in sorted(seqRuns.keys()): # for run,lane in seqRuns[sampleID].keys(): # refGenome,paired,path = seqRuns[sampleID][(run,lane)] # # count up reads and bases # if paired: # for pe in [1]: # don't need the second because it has same number of reads # call = "wc -l %s > %s"%(path+'s_'+lane+'_'+str(pe)+'_sequence.txt', outdir+'tmpread.txt') # print 'call', call # os.system(call) # line = readList(outdir+'tmpread.txt') # nreads, crap = line[0].split(' ') # nreads = float(nreads)/2.0 # each read takes up 2 rows in file # row = [sampleID, run, lane, pe,'%.5e'%(float(nreads)), '%.5e'%(float(nreads)*readlen), '%.2f'%(float(nreads)*readlen/float(2*sum(chromlengths.values())))] # readbptab += [row] # os.system('rm "%s"'%(outdir+'tmpread.txt')) # else: # print 'have not implemented single end raw read counting yet...' # printFormattedTable(readbptab, readbphead, file=zdir+'Raw reads summary-%s.txt'%(zkey)) # # sys.exit('Done counting raw reads. Exiting...') # print if GETINSERTSIZE: # save an insert size histogram for each sampleID and a summary table providing .95, .99, and min/max stats # this only works in bowtie saved output as sam maps print 'Computing PE insert size distributions...' 
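# Insert sizes are taken from SAM column 9 (TLEN; index 8 after splitting each
# record on tabs). A TLEN of 0 means the template length is unavailable and the
# record is skipped; the absolute value is used because the two mates of a pair
# report TLEN with opposite signs. Percentiles of the resulting distribution
# (e.g. the 0.99 interval) become the --minins/--maxins bounds when the samples
# are realigned.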
peinsize = [] for sampleID in sorted(seqRuns.keys()): insvalues = [] for run,lane in seqRuns[sampleID].keys(): refGenome,paired,path = seqRuns[sampleID][(run,lane)] UID = '%s-%s-%s'%(sampleID,run,lane) # only used for paired end reads if paired: # get data from the sam/bam file fh = open(btmapdir+UID+'.map') for line in fh.readlines(): if line[0]=='@': continue row = line.strip('\n').split('\t') # print 'row has', len(row) # print row insvalue = abs(int(row[8])) if insvalue != 0: insvalues += [insvalue] fh.close() insvalues = filter(lambda x: x!=nastr and True, insvalues) if len(insvalues) == 0: print '- Unable to read read map for %s'%(sampleID) else: print '- %s has %s values'%(sampleID, len(insvalues)) print ' saving histogram...' hist = histogram(insvalues, nbins=25, yfreq=True) printTable(hist, ['Size', 'Count'], file=zinsdir+sampleID+' histogram.txt') print ' calculating statistics...' ansd = sd(insvalues) meanins = avg(insvalues) sdL = meanins - ansd; sdR = meanins + ansd ninefiveL = percentile(insvalues, .025) ninefiveR = percentile(insvalues, .975) ninenineL = percentile(insvalues, .005) ninenineR = percentile(insvalues, .995) mn = min(insvalues); mx = max(insvalues) perow = [sampleID, meanins, sdL, sdR, ninefiveL, ninefiveR, ninenineL, ninenineR, mn, mx] print perow peinsize += [perow] header = ['ID', 'Mean', 'minSD', 'maxSD', 'min.95', 'max.95', 'min.99', 'max.99', 'min', 'max'] printTable(peinsize, header, file=peinsfile) print '=> Finished estimating insert size distributions.\n\nRerun sniper.py using the -ra flag to realign reads using these estimated insert size bounds.\n' sys.exit() # ========================================================== # Part II: generate final read files grouped by multiplicity # ========================================================== GR = None # we need the reference genome here... flg = None NEEDTOLOADREFERENCE = True # if this flag is triggered, redo multiplicity filter and then perform SNP calling if ORGANIZEREADS: print '\nPART II: READ ORGANIZATION' if NEEDTOLOADREFERENCE: sys.stdout.write(' - Loading reference genome... '); sys.stdout.flush() GR = readFasta(bfaref, TOUPPER=True) flg = sum(map(lambda x: len(x['seq']), GR.values())) sys.stdout.write('N=%s loci\n'%(flg)) NEEDTOLOADREFERENCE = False GRlengths = dict( map(lambda x: (x, len(GR[x]['seq'])), GR.iterkeys()) ) # touch new summary tables pmafile = zdir+'Summary-Perfect vs mismatch alignments%s.txt'%(ubamap) if not os.access(pmafile, os.F_OK): header=['Sample', 'Perfect alignments', 'Mismatched alignments', 'Total alignments', 'Frac perfect', 'Frac mismatch'] record('\t'.join(header), pmafile, 'w') almfile = zdir+'Summary-alignment multiplicity%s.txt'%(ubamap) if not os.access(almfile, os.F_OK): record('Sample: SE and PE multiplicity distributions (alignments, reads)', almfile, 'w') crdfile = zdir+'Summary-chromosome read distribution.txt' if not os.access(crdfile, os.F_OK): record('SampleID\tChrom\tUnique SE\tUnique PE\tNon-unique SE\tNon-unique PE', crdfile, 'w') currentsample = 0 for sampleID in sorted(seqRuns.keys()): sys.stdout.write(' - Looking at sample: %s\n'%(sampleID)); sys.stdout.flush() if (not REORGANIZEREADS) and os.access(btmapdir+sampleID+'.PE.unique.map', os.F_OK): sys.stdout.write(' Already processed.\n'); sys.stdout.flush() continue # record read multiplicity - number of times each read appears # ------------------------------------------------------------ print ' - Assessing read multiplicity...' 
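# Read multiplicity is the number of reported alignments per read id. Read ids
# are prefixed with 'run,lane-' so they are unique across runs and hashed to
# integers to keep the counting dictionaries small; reads with exactly one
# alignment are later written to <sampleID>.PE.unique.map / .SE.unique.map and
# all others to the corresponding .degenerate.map files.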
# addend = {'PE':.5, 'SE':1.0} # used as a lookup below # per sampleID variables of interest npmms = 0; nrows = 0 # count up total rows and total rows with mismatches totreads = {'SE':0, 'PE':0, 'Total':0} totaln = {'SE':0, 'PE':0, 'Total':0} tmpTotal = {} # aggregate into total histogram # for each sampleID save new SAM-format read files # Partition reads into 2 multiplicity groups funi = open(btmapdir+sampleID+'.PE.unique.map', 'w') funiSE = open(btmapdir+sampleID+'.SE.unique.map', 'w') fdeg = open(btmapdir+sampleID+'.PE.degenerate.map', 'w') fdegSE = open(btmapdir+sampleID+'.SE.degenerate.map', 'w') ulinesat = {'PE':{}, 'SE':{}} dlinesat = {'PE':{}, 'SE':{}} isizes = [] # the list of (run,lane) pairs of samples corresponding to this sampleID thesamples = seqRuns[sampleID].keys(); thesamples.sort() # potentially distribute this across CPU? for run,lane in thesamples: # multiplicity counting by read # key is the hitid, value is count, both reads from a mate pair count at 1 RID = {'SE':{}, 'PE':{}} currentsample += 1 UID = '%s-%s-%s'%(sampleID,run,lane) refGenome,paired,path = seqRuns[sampleID][(run,lane)] rlp = '%s,%s-'%(run,lane) # run,lane prefix for all reads fpairs = [] if paired == 1: # filename for standard PE run fpairs += [('PE', btmapdir+UID+'.map')] # filenames for singleton leftovers from PE fpairs += [('SE', btmapdir+UID+'_singles_1.map')] fpairs += [('SE', btmapdir+UID+'_singles_2.map')] else: # filename for singleton run fpairs += [('SE', btmapdir+UID+'.map')] for pese,fname in fpairs: if os.access(fname, os.F_OK): parts = fname.split('/') sys.stdout.write(' - %s Counting alignments for %s...'%(currentsample, parts[len(parts)-1])) sys.stdout.flush() fh = open(fname) count = 0 for row in fh: if row[0]=='@': continue if count % 5000000 == 0: sys.stdout.write('%s,'%(count)); sys.stdout.flush() count += 1 lst = row[:-1].split('\t') try: hitid = lst[0] chrom = lst[2] rem = ' '.join(lst[11:len(lst)]) except IndexError: continue if chrom == '*': continue # record read ids with multiplicity # base = hitid.strip('/1').strip('/2') # hitid = rlp+base # this id is unique across runs # try: RID[pese][hitid] += addend[pese] # don't double count a PE read # except KeyError: RID[pese][hitid] = addend[pese] # record read ids with multiplicity hitid = rlp+hitid # this id is globally unique across runs # replace hitid with unique integer value hashedid = hash(hitid) # each read of a paired end read is considered to be distinct try: RID[pese][hashedid] += 1 except KeyError: RID[pese][hashedid] = 1 # record mismatches tmp = re.split('.*NM:i:(\d+).*', rem) npmms += (len(tmp)>1 and int(tmp[1])>0 and 1) or 0 nrows += 1 fh.close() sys.stdout.write('Done.\n'); sys.stdout.flush() else: print 'Cannot access', fname # end loop reading read info # RID is now populated # ------------------------------- # reorganize RIDSE and RIDPE as dictionaries keyed by multiplicity # to quickly generate a histogram # sys.stdout.write(' - Computing multiplicity histogram...\n'); sys.stdout.flush() # RIDm = {'SE':dict(map(lambda x: (x,0), range(1,maxnumalign+1))), # 'PE':dict(map(lambda x: (x,0), range(1,maxnumalign+1)))} # quickly generate histograms of multiplicity hist = {'SE':[], 'PE':[], 'Total':[]} # RID is keyed by SE or PE returning a dictionary of hitid,count pairs if SKIPHIST: sys.stdout.write('skipping...too many reads\n'); sys.stdout.flush() else: for sp,ridsp in RID.items(): RIDm = dict(map(lambda x: (x,0), xrange(1,maxnumalign+1))) sys.stdout.write(' %s...'%(sp)); sys.stdout.flush() for hitid,val in 
ridsp.iteritems(): try: RIDm[val] += 1 except KeyError: RIDm[val] = 1 hist[sp] = map(lambda val: (val,RIDm[val]), RIDm.keys()) totreads[sp] += sum([foo[1] for foo in hist[sp]]) totaln[sp] += sum([foo[0]*foo[1] for foo in hist[sp]]) sys.stdout.write('DONE.'); sys.stdout.flush() del RIDm totreads['Total'] += totreads['SE']+totreads['PE'] totaln['Total'] += totaln['SE']+totaln['PE'] for sp in ['SE', 'PE']: for val,vct in hist[sp]: try: tmpTotal[val] += vct except KeyError: tmpTotal[val] = vct # for each sampleID save a new SAM-format read file for unique and non-unique reads # --------------------------------------------------------------------------------- pipeit('\n - Saving partitioned read maps...', 1) # for run,lane in subsamples: # UID = '%s-%s-%s'%(sampleID,run,lane) # refGenome,paired,path = seqRuns[sampleID][(run,lane)] # rlp = '%s,%s-'%(run,lane) # run,lane prefix for all reads thefiles = [btmapdir+UID+'.map'] if os.access(btmapdir+UID+'_singles_1.map', os.F_OK): thefiles += [btmapdir+UID+'_singles_1.map'] if os.access(btmapdir+UID+'_singles_2.map', os.F_OK): thefiles += [btmapdir+UID+'_singles_2.map'] for fn in thefiles: label = tail(fn.split('/')) sys.stdout.write(' - Processing %s...'%(label)); sys.stdout.flush() fh = open(fn) rowcnt = 0 for row in fh: if rowcnt % 2500000 == 0: sys.stdout.write('%s,'%(rowcnt)); sys.stdout.flush() rowcnt += 1 linen = row[:-1]#.strip('\n') prelinen = rlp+linen # only header to keep is the @PG line which indicates bowtie call # if row[0:3]=='@SQ': continue if row[0:3]=='@PG': print>>funi, linen; print>>fdeg, linen; continue elif row[0]=='@': continue lst = linen.split('\t') hitid = rlp+lst[0] chrom = lst[2] hasmate = lst[6]; insertsize=int(lst[8]) seq=lst[9]; qual=lst[10] # toss unmapped reads if chrom == '*': continue # don't double count, so only record positive values # only keep a fraction of them to save on memory if hasmate != '*' and insertsize >= 0 and random.random < 0.1: isizes += [insertsize] # save read by its multiplicity hashedid = hash(hitid) if hashedid in RID['SE']: if RID['SE'][hashedid] == 1: print >>funiSE, prelinen # get our own line count for proper sam file format try: ulinesat['SE'][chrom] += 1 except KeyError: ulinesat['SE'][chrom] = 1 else: print >>fdegSE, prelinen # get our own line count for proper sam file format try: dlinesat['SE'][chrom] += 1 except KeyError: dlinesat['SE'][chrom] = 1 elif hashedid in RID['PE']: if RID['PE'][hashedid] == 1: print >>funi, prelinen # get our own line count for proper sam file format try: ulinesat['PE'][chrom] += 1 except KeyError: ulinesat['PE'][chrom] = 1 else: print >>fdeg, prelinen # get our own line count for proper sam file format try: dlinesat['PE'][chrom] += 1 except KeyError: dlinesat['PE'][chrom] = 1 sys.stdout.write('Done.\n'); sys.stdout.flush() fh.close() # END LOOPING OVER FILES # Save results for this sampleID # move on to next group if there are no mapped reads if nrows == 0: continue # record number of perfect/mismatch alignments perfrac = 0 nper = nrows - npmms row = [] perfrac = nper/float(nrows) mmfrac = npmms/float(nrows) row += ['There are %s/%s (%s) perfect (%s/%s) alignments'%(nper,nrows,perfrac,maxreadlen,maxreadlen)] row += ['There are %s/%s (%s) mismatched (<%s/%s) alignments'%(npmms,nrows,mmfrac,maxreadlen,maxreadlen)] printList(row, file=pmadir+'Perfect vs mismatch alignments-%s.txt'%(sampleID)) # save perfect/mismatch reads summary info = '\t'.join(map(str, [sampleID, nper, npmms, nrows, perfrac, mmfrac])) record(info, pmafile, 'a') # save read and 
multiplicity statistics sk = tmpTotal.keys(); sk.sort() hist['Total'] = map(lambda val: (val, tmpTotal[val]), sk) sestr = ', '.join('('+','.join(map(str,p))+')' for p in hist['SE']) pestr = ', '.join('('+','.join(map(str,p))+')' for p in hist['PE']) totstr = ', '.join('('+','.join(map(str,p))+')' for p in hist['Total']) del hist # clean up if sestr == '': sestr = nastr if pestr == '': pestr = nastr recfile = almfile record(sampleID, recfile, 'a') record('SE: reads=%s, alignments=%s, alignments/read=%s'%(totreads['SE'], totaln['SE'], roundna(3)(divna(totaln['SE'],totreads['SE']))), recfile, 'a') record('Histogram: '+sestr, recfile, 'a') record('PE: reads=%s, alignments=%s, alignments/read=%s'%(totreads['PE'], totaln['PE'], roundna(3)(divna(totaln['PE'], totreads['PE']))), recfile, 'a') record('Histogram: '+pestr, recfile, 'a') record('Total: reads=%s, alignments=%s, alignments/read=%s'%(totreads['Total'], totaln['Total'], roundna(3)(divna(totaln['Total'], totreads['Total']))), recfile, 'a') record('Histogram: '+totstr, recfile, 'a') record('', recfile, 'a') # Done saving statistics for this sampleID # ---------------------------------------------------- funi.close(); fdeg.close() funiSE.close(); fdegSE.close() print ' - done partitioning unique and degenerate reads.' # ============================================================== # save the number of SE/PE reads on each chromosome fh = open(zdir+'Summary-chromosome read distribution.txt', 'a') # this prints the number of reads on each chromosome for k in sorted(GR.keys()): use = upe = dse = dpe = 0 try: use = ulinesat['SE'][k] except KeyError: pass try: upe = ulinesat['PE'][k] except KeyError: pass try: dse = dlinesat['SE'][k] except KeyError: pass try: dpe = dlinesat['PE'][k] except KeyError: pass print >>fh, '\t'.join(map(str, [sampleID, k, use, upe, dse, dpe])) fh.close() # ============================================================== # ========================================= # sort read maps (not currently implemented) # for pese in ['PE', 'SE']: # os.system('sort -t "\t" -k 2,3n "%s" > "%s"'%(btmapdir+sampleID+'.'+pese+'.unique.map', btmapdir+sampleID+'.'+pese+'.unique.map')) # os.system('sort -t "\t" -k 2,3n "%s" > "%s"'%(btmapdir+sampleID+'.'+pese+'.degenerate.map', btmapdir+sampleID+'.'+pese+'.degenerate.map')) # alternatively could use samtools for sorting... 
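# Each partitioned map is given a minimal SAM header before use: an @HD line
# plus one @SQ line (sequence name and length taken from the loaded reference)
# per chromosome, since valid SAM requires @SQ records ahead of the alignment
# lines. The header is written to a temporary .uheader/.dheader file, the map
# body is appended to it with cat, and the result is moved back over the
# original map filename.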
# ========================================= sys.stdout.write(' - saving sam file headers...'); sys.stdout.flush() for pese in ['PE', 'SE']: # unique map fh = open(btmapdir+sampleID+'.'+pese+'.uheader.txt', 'w') print >>fh, '\t'.join(['@HD', 'VN:1.0', 'SO:unsorted']) for k,v in sorted(GRlengths.items()): print >>fh, '\t'.join(['@SQ', 'SN:'+k, 'LN:'+str(v)]) fh.close() # concatenate header and body together call = 'cat "%s" >> "%s"'%(btmapdir+sampleID+'.'+pese+'.unique.map', btmapdir+sampleID+'.'+pese+'.uheader.txt') os.system(call) call = 'mv "%s" "%s"'%(btmapdir+sampleID+'.'+pese+'.uheader.txt', btmapdir+sampleID+'.'+pese+'.unique.map') os.system(call) # degenerate map fh = open(btmapdir+sampleID+'.'+pese+'.dheader.txt', 'w') print >>fh, '\t'.join(['@HD', 'VN:1.0', 'SO:unsorted']) # this prints the number of reads on each chromosome # for k,v in sorted(dlinesat[pese].items()): # print >>fh, '\t'.join(['@SQ', 'SN:'+k, 'LN:'+str(v)]) for k,v in sorted(GRlengths.items()): print >>fh, '\t'.join(['@SQ', 'SN:'+k, 'LN:'+str(v)]) fh.close() # concatenate header and body together call = 'cat "%s" >> "%s"'%(btmapdir+sampleID+'.'+pese+'.degenerate.map', btmapdir+sampleID+'.'+pese+'.dheader.txt') os.system(call) call = 'mv "%s" "%s"'%(btmapdir+sampleID+'.'+pese+'.dheader.txt', btmapdir+sampleID+'.'+pese+'.degenerate.map') os.system(call) sys.stdout.write('Done.\n'); sys.stdout.flush() sys.stdout.write(' - saving empirical insert size distribution...'); sys.stdout.flush() hist = histogram(isizes[0:100000], nbins=100) printTable(hist, header=['Size', 'Count'], file=zinsdir+'Insert size distribution.txt') sys.stdout.write('Done.\n'); sys.stdout.flush() # clean up del RID; del ulinesat; del dlinesat; del isizes # finally delete the precursor bowtie files # ----------------------------------------- if not KEEPBOWTIE: for run,lane in seqRuns[sampleID].keys(): UID = '%s-%s-%s'%(sampleID,run,lane) refGenome,paired,path = seqRuns[sampleID][(run,lane)] thefiles = [btmapdir+UID+'.map', btmapdir+UID+'_singles_1.map', btmapdir+UID+'_singles_2.map'] for fn in thefiles: os.system('rm -f "%s"'%(fn)) print 'DONE PART II: READ ORGANIZATION.\n' if ESTIMATEGLOBALERROR: log = lambda x: math.log(x, 10) Q2P = lambda e: (e == 0 and 1-1e-16) or 10**(-e/10.) # avoid 1 P2Q = lambda p: -10*log(p) A2P = dict( [ascii, Q2P(A2Q[ascii])] for ascii in A2Q.keys() ) print 'Estimating global per-nucleotide sequencing error rate...' 
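# Phred conversions used below: Q2P maps a phred score to an error probability,
# P = 10^(-Q/10) (e.g. Q=30 corresponds to P = 0.001; Q=0 is nudged below 1 to
# avoid a probability of exactly 1), and P2Q is its inverse. The global error
# rate reported here is the mean phred score over every base quality in the
# uniquely mapping read files (via the A2Q character-to-phred lookup defined
# elsewhere in this script), also expressed as a probability through Q2P.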
errorsum = 0 ecount = 0 # errors = [] for mapfile in getFiles(btmapdir, include='unique'): print '- Loading', mapfile fh = open(mapfile) count = 1 for line in fh: if count % 1000000 == 0: print ' reached', count count+=1 try: lst = line[:-1].split('\t') chrom = lst[2] if chrom == '*': continue for x in map(lambda x: A2Q[x], lst[10]): errorsum += x ecount += 1 # exception if a line beginning with @ except IndexError: pass except ValueError: pass fh.close() print 'Mean error over %s positions is'%(ecount), errorsum/float(ecount), 'or', Q2P(errorsum/float(ecount)) print sys.exit() # DONE # ==================================================== # Part III: Identify SNPs using bayesian SNP detection # ==================================================== if SNPCALLING: print 'PART III: Genotyping' if not bfaref: sys.exit('Reference genome must be specified using --ref/-r ') createdir(resdir); createdir(snpdir); createdir(tmpdir) sampleIDs = sorted(seqRuns.keys()) newSampleIDs = [] if len(singlesampleid): for sid in singlesampleid: if sid in sampleIDs: newSampleIDs += [sid] else: sys.exit('ID specified by --sampleid ID not found in map.') if len(newSampleIDs): sampleIDs = newSampleIDs; del newSampleIDs # Only load reference if re-estimating... if GR == None: NEEDTOLOADREFERENCE = True for sampleID in sampleIDs: motherfile = snpdir+sampleID+'.mother' # genotype at all informative positions motherfile += boundslabel+'.txt' if not os.access(motherfile, os.F_OK) or REDOSNPCALLING: NEEDTOLOADREFERENCE = True if NEEDTOLOADREFERENCE: sys.stdout.write(' - Loading reference genome... '); sys.stdout.flush() GR = readFasta(bfaref, TOUPPER=True) flg = sum(map(lambda x: len(x['seq']), GR.values())) sys.stdout.write('N=%s loci\n'%(flg)) thesnpdir = snpdir if DRYRUN: thesnpdir = tmpdir def dummyLooper(jobs=[]): for kwargs in jobs: job = threading.Thread(target=sniper, kwargs=kwargs) # job = multiprocessing.Process(target=sniper, kwargs=kwargs) job.run() return 1 # which sampleIDs do we actually need to execute on? 
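# Samples still lacking a .mother output are collected below and split into
# nthreads groups of roughly jpt jobs each. Each group is handed to one
# dummyLooper thread started with start(); within a group, dummyLooper invokes
# sniper via Thread.run(), so the samples of one group are processed
# sequentially while the groups themselves run concurrently. The main thread
# then waits until it is the only active thread.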
sids = [] mofs = {} noms = {} for sid in sampleIDs: motherfile = thesnpdir+sid+'.mother'+boundslabel+'.txt' nominalfile = thesnpdir+sid+'.nominal'+boundslabel+'.txt' if not os.access(motherfile, os.F_OK) or REDOSNPCALLING: print ' - Genotyping sample %s...'%(sid) sids += [sid] mofs[sid] = motherfile noms[sid] = (SAVENOMINAL and nominalfile) or '' else: print ' - Already processed/processing sample %s: SKIP'%(sid) jpt = int(math.ceil(len(sids)/float(nthreads))) threadJobs = [[] for i in range(nthreads)] ct = 0 for sampleID in sids: # set up SNP calling for PE vs PE+SE / unique reads vs unique + deg reads pref = btmapdir+sampleID rf = {'PEuni':pref+'.PE.unique.map', 'PEdeg':'', 'SEuni':'', 'SEdeg':''} if USESINGLETONS: rf['SEuni'] = pref+'.SE.unique.map' if not UNIQ and not BEST: rf['PEdeg'] = pref+'.PE.degenerate.map' rf['SEdeg'] = pref+'.SE.degenerate.map' else: if not UNIQ and not BEST: rf['PEdeg'] = pref+'.PE.degenerate.map' # NO LONGER NECESSARY - THIS IS NOW GENERALIZED # estimate read length from the first line of the map file # if not READLENBYUSER: # rls = [] # for foo in rf.keys(): # if len(rf[foo]): # foofh = open(rf[foo]) # for line in foofh: # try: # lst = line[:-1].split('\t') # chrom = lst[2] # if chrom == '*': continue # rls += [len(lst[9])] # break # # exception if a line beginning with @ # except IndexError: pass # except ValueError: pass # foofh.close() # # if len(unique(rls)) == 1: # readlen = rls[0] # print ' - Detected read length of %s nt.'%(readlen) # else: # sys.exit('Multiple read lengths detected in read map files. User must specify proper length using --readlength flag.') # kwargs = {'readfiles':rf, 'GR':GR, 'saveall':mofs[sampleID], 'outfile':noms[sampleID], 'expPhred':expPhred, 'maxreadlen':maxreadlen, 'Lambda':Lambda, 'prior':prior, 'theta':theta, 'tmpdir':tmpdir, 'bounds':seqbounds, 'LIPieces':LIPieces} if len(threadJobs[ct]) < jpt: threadJobs[ct] += [kwargs] else: ct += 1; threadJobs[ct] += [kwargs] for group in threadJobs: job = threading.Thread(target=dummyLooper, kwargs={'jobs':group}) # job = multiprocessing.Process(target=dummyLooper, kwargs={'jobs':group}) job.start() while threading.activeCount() > 1: time.sleep(1) for sampleID in sampleIDs: print ' - Identifying significant SNPs for %s...'%(sampleID) motherfile = thesnpdir+sampleID+'.mother' # genotype at all informative positions motherfile += boundslabel+'.txt' # nominalfile = None # if SAVENOMINAL: # nominalfile = thesnpdir+sampleID+'.nominal' # nominally significant snp variants # nominalfile += boundslabel+'.txt' # first check if we need to recompute anything checkups = [] for minPvalue in QvalueRange: # multiple-test corrected snp variants snpfile = thesnpdir+sampleID+boundslabel+'.Q%s.txt'%(minPvalue) if os.access(snpfile, os.F_OK) and not REDOFINALSNPCALLING: continue checkups += [(minPvalue,snpfile)] checkups.sort() qccheck = lambda line: len(line[:-1].split('\t'))==11 and True infh = None if len(checkups)>0: # first get total number of hypotheses from mother and store data print ' - Counting number of genomic positions evaluated from mother SNP file...' 
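# The number of hypotheses used for the multiple-test correction is the count
# of well-formed rows (exactly 11 tab-delimited fields; see qccheck) in the
# sample's .mother file, i.e. every genomic position that was actually
# evaluated. It is written as a '# Total positions tested' comment at the top
# of each .Q<phred>.txt output and is the denominator in the per-stringency
# summary printed below.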
try: infh = open(motherfile) except IOError: print 'Cannot access:', motherfile, '...SKIP' if infh: data = filter(qccheck, infh) infh.close() numHypos = len(data) # if mother file contained no information if numHypos == 0: numHypos = 1 print ' - There are %s hypotheses...'%(numHypos) refilt = {} if REDUNDANCYFILTER and os.access(REDUNDANCYFILTER, os.F_OK): # print 'REFILTERING' fh = open(REDUNDANCYFILTER) for line in fh: entries = map(lambda x: x.split(','), line[:-1].split('\t')) # what are all targets offset = 0 tlist = map(lambda x: [x[0],int(x[1])+offset], entries) for c,p,l in entries: refilt[c,int(p)+offset] = tlist fh.close() # now we can post-filter the significant set # based on the actual number of hypotheses we made for Pvalue,snpfile in checkups: pipeit(' - Q%s...'%(Pvalue)) # generate list of significant genotypes snpfile = thesnpdir+sampleID+boundslabel+'.Q%s.txt'%(Pvalue) outfh = open(snpfile, 'w') print >>outfh, '# Total positions tested: %s'%(numHypos) keepers = [] jointmaybes = {} for i in xrange(len(data)): row = data[i][:-1].split('\t') tchrm = row[0]; tpos = int(row[1]) Q = (row[4]==nastr or row[4]=='NA' and nastr) or float(row[4]) ref = row[2]; geno = row[3]; alpha = float(row[8]) genoref = (prior != 'haploid' and iupac2nts(ref)) or ref if genoref != geno and Q != nastr: # check for SNP candidacy ISCANDIDATE = False if (not MAXALPHA and Q >= Pvalue) or (MAXALPHA and alpha <= MAXALPHA and alpha >= MINALPHA and Q >= Pvalue): ISCANDIDATE = True elif POSTPROCESSING: # basically, if we have high depth and at most 10:1 deg to unique, # then ignore the quality scores at this position # moreover if AAFD is close to 50:50 or 100:0 but MAP genotype does not # reflect this, then fix the genotype AAFD = sorted(map(lambda y: (float(y[1]),y[0]), map(lambda x: x.split(':'), row[10].split(',')))) depth = int(row[5]) alpha = float(row[8])+1e-10 # basically this says ignore the P-value if the depth and degeneracy are ok # realistically this is just like using maq except we have degeneracy values if (depth >= FDEPTH[0] and depth/alpha >= FDEG[0] and Q >= FQ[0]) or (depth >= FDEPTH[1] and depth/alpha >= FDEG[1] and Q >= FQ[1]): ISCANDIDATE = True if POSTFILTER: depth = int(row[5]); ddeg = int(row[6]) if float(ddeg)/depth > DDEGCUTOFF: ISCANDIDATE = False # find SNPs that have template redudndancy at this Pvalue if ((not MAXALPHA and Q >= Pvalue) or (MAXALPHA and alpha <= MAXALPHA and alpha >= MINALPHA and Q >= Pvalue)) and (tchrm,tpos) in refilt: ISCANDIDATE = False jointmaybes[(tchrm,tpos)] = [row,i] # FINALLY SAVE SNP IF STILL GOOD if ISCANDIDATE: row[9] = 'Y' print >>outfh, '\t'.join(row) # since this is largest Pvalue, we continue filtering rows... 
# this will accelerate future scans keepers += [data[i]] # update keepers with subset of jointmaybes # this is a list of the positions - cluster these into their redundancy groups # then choose the best 1 from each group candfile = thesnpdir+sampleID+boundslabel+'.candidate.Q%s.txt'%(Pvalue) grouping = {} groupkeeps = 0 # at each position, check for snps at redundant positions for tchrm,tpos in jointmaybes.keys(): # first, we want to isolate positions with the same snp call tmpdct = {} # group redundant snps by genotype for x in refilt[(tchrm,tpos)]: try: tmp = jointmaybes[tuple(x)] entry = (int(tmp[0][4]), float(tmp[0][8]), tmp) try: tmpdct[tmp[0][3]] += [entry] except KeyError: tmpdct[tmp[0][3]] = [entry] except KeyError: pass # treat each genotype separately - we want to avoid replicate calls for cons,tmpobj in tmpdct.iteritems(): tmplist = tmpobj[:] # list of positions with same genotype q = None; row = None; i = None # group calls by quality value qdct = {} for q,alpha,(row,i) in tmplist: try: qdct[q] += [(alpha,row,i)] except KeyError: qdct[q] = [(alpha,row,i)] maxq = sorted(qdct.keys())[-1] # get maximum q group # group snps of max quality by alpha adct = {} for alpha,row,i in qdct[maxq]: try: adct[alpha] += [(row,i)] except KeyError: adct[alpha] = [(row,i)] # keep the snp if there is a single best alpha minalpha = sorted(adct.keys())[0] tmplist = [] for ttlist in adct.values(): tmplist += ttlist # temporarily disabled if len(tmplist) == 1: row,i = tmplist[0] row[9] = 'Y' print >>outfh, '\t'.join(row) keepers += [data[i]] groupkeeps += 1 # else: # # return the remainder # if len(tmplist): print >> outfh2, tmplist nsig = len(keepers) data = keepers outfh.close() pipeit(': %s/%s significant SNP variants at Q > %s (P < %s).'%(nsig, numHypos, Pvalue, 10**(-Pvalue/10.)),1) os.system('rm -rf "%s"'%(tmpdir)) print '\nResults saved to %s'%(thesnpdir) print 'DONE PART III: GENOTYPING.\n' # ==================================================== # Part IV: Diagnostics # ==================================================== if DIAGNOSTICS: print 'PART IV: DIAGNOSTICS...' # use the alloutfile results from SNP calling to gather diagnostics # including the following: # number of reference positions for which we have no read data # histogram of total positions with x read depth # histogram of posterior probabilities for all positions and significant positions stats = { 'Ns':{}, 'numHypos':{}, 'Qns':{}, 'DH':{}, 'Davg':{}, 'alpha':{}} desc = {'sigG':'Distribution of significant genotypes', 'Gtype':'Distribution of significant categorical genotypes', 'Ns': 'Reference positions with no read data', 'numHypos': 'Positions with no information (depth=0 or ref=N)', 'Qns': 'Distribution of all posterior probabilities', 'Qs': 'Distribution of significant posterior probabilities', 'DH': 'Distribution of genome-wide read depth', 'Davg': 'Genome-wide average read depth (avg. depth per position)', 'sigDavg': 'Average read depth (avg. 
depth per position) for significant positions', 'alpha': 'Genome-wide degenerate/unique alignment ratio statistics', 'sigalpha': 'Significant SNP degenerate/unique alignment ratio statistics'} # finish initializing sig entries Pstats = {} for minPvalue in map(str,QvalueRange): Pstats[minPvalue] = {'sigG':{}, 'Qs':{}, 'sigDavg':{}, 'sigalpha':{}, 'Gtype':{}} diagfile = resdir+'Summary-diagnostics' diagfile += ubamap if POSTPROCESSING: diagfile += '-rcf' diagfile += '.txt' fh = open(diagfile, 'w') fh.close() for sampleID in sorted(seqRuns.keys()): motherfile = snpdir+sampleID+'.mother.txt' Ns = {} numHypos = {} # number of positions with information, by chromosome DH = {0:0, 1:0, 2:0, 3:0, 4:0, 5:0, 6:0, 7:0, 8:0, 9:0, 10:0, 11:0, 12:0, 13:0, 14:0, 15:0, 16:0, 17:0, 18:0, 19:0, 20:0, 40:0, 60:0, 80:0, 100:0, 200:0, 300:0, 400:0, 500:0} DHints = sorted(DH.keys()) Dsum = 0; Dcount = 0; allDvals = [] alphasum = 0; allalpha = [] Qns = {0:0, 5:0, 10:0, 15:0, 20:0, 25:0, 30:0, 35:0, 40:0, 44:0, 50:0, 55:0, 60:0, 65:0, 70:0, 75:0, 80:0, 85:0, 90:0, 95:0, 100:0, 105:0, 110:0, 115:0, 120:0, 125:0, 130:0, 135:0, 140:0, 145:0, 150:0, 155:0, 160:0} Qints = sorted(Qns.keys()) print ' - Collecting statistics from %s mother file...'%(sampleID) fh = open(motherfile) for line in fh: row = line[:-1].split('\t') if len(row) != 11: continue C = row[0] P = int(row[1]) R = row[2] G = row[3] Q = row[4]=='NA' and row[4] or int(row[4]) D = int(row[5]) M = int(row[6]) TM = int(row[7]) alpha = float(row[8]) S = row[9] # Y/N Dsum += D Dcount += 1 allDvals += [D] # ratio of unique alignments at this position against total degenerate alignments alphasum += alpha allalpha += [alpha] try: numHypos[C] += 1 except KeyError: numHypos[C] = 1 try: DH[D] += 1 except KeyError: # then it is between intervals idx = 0 while idx < len(DHints) and D>DHints[idx]:idx += 1 if idx == len(DHints): idx -= 1 DH[DHints[idx]] += 1 if G == 'NN': try: Ns[C] += 1 except KeyError: Ns[C] = 1 if Q != 'NA': try: Qns[Q] += 1 except KeyError: try: idx = 0 while Q>Qints[idx]: idx += 1 Qns[Qints[idx]] += 1 except IndexError: Qns[Qints[len(Qints)-1]] += 1 fh.close() allalpha = filterstr(nastr)(map(float,allalpha)) allDvals = filterstr(nastr)(map(float,allDvals)) # save genome-wide statistics for this sampleID stats['Ns'][sampleID] = Ns stats['numHypos'][sampleID] = numHypos stats['Qns'][sampleID] = Qns stats['DH'][sampleID] = DH stats['Davg'][sampleID] = (divna(Dsum,Dcount), sd(allDvals), getQuants(allDvals)) stats['alpha'][sampleID] = (divna(alphasum,Dcount), sd(allalpha), getQuants(allalpha)) # collect stats for significant snps for minPvalue in QvalueRange: snpfile = snpdir+sampleID+boundslabel+'.Q%s.txt'%(minPvalue) sigG = {}; Gtype = {'h0':0, 't1':0, 'h2':0, 't2':0} sigDsum = 0; sigDcount = 0; sigDvals = [] sigalphasum = 0 allsigalpha = [] Qs = {0:0, 5:0, 10:0, 15:0, 20:0, 25:0, 30:0, 35:0, 40:0, 44:0, 50:0, 55:0, 60:0, 65:0, 70:0, 75:0, 80:0, 85:0, 90:0, 95:0, 100:0, 105:0, 110:0, 115:0, 120:0, 125:0, 130:0, 135:0, 140:0, 145:0, 150:0, 155:0, 160:0} print ' - Collecting statistics from significant SNPs file at FWER < %s...'%(minPvalue) # now pass through significant snps fh = open(snpfile) for line in fh: row = line[:-1].split('\t') if len(row) != 11: continue C = row[0] P = int(row[1]) R = row[2] G = row[3] Q = row[4]=='NA' and row[4] or int(row[4]) D = int(row[5]) M = int(row[6]) TM = int(row[7]) alpha = float(row[8]) S = row[9] # Y/N gg = list(G) gj = ''.join(sorted(gg)) try: sigG[gj] += 1 except KeyError: sigG[gj] = 1 # homo/het/triallele if 
gg[0]==gg[1]: if gg[0]==R: Gtype['h0'] += 1 else: Gtype['h2'] += 1 else: if gg[0]!=R and gg[1]!=R: Gtype['t2'] += 1 else: Gtype['t1'] += 1 try: Qs[Q] += 1 except KeyError: try: idx = 0 while Q>Qints[idx]: idx += 1 Qs[Qints[idx]] += 1 except IndexError: Qs[Qints[len(Qints)-1]] += 1 sigalphasum += alpha allsigalpha += [alpha] sigDsum += D sigDcount += 1 sigDvals += [D] fh.close() # save significant statistics for this sampleID at Pvalue PP = str(minPvalue) Pstats[PP]['Qs'][sampleID] = Qs Pstats[PP]['sigG'][sampleID] = sigG Pstats[PP]['Gtype'][sampleID] = Gtype Pstats[PP]['sigDavg'][sampleID] = (divna(sigDsum,sigDcount), sd(sigDvals), getQuants(sigDvals)) Pstats[PP]['sigalpha'][sampleID] = (divna(sigalphasum, sigDcount), sd(allsigalpha), getQuants(allsigalpha)) # print out a summary diagnostics file fh = open(diagfile, 'a') for k in sorted(stats.keys()): # print >>fh, 'Diagnostic:', desc[k] print >>fh, desc[k] for sid in sorted(stats[k].keys()): info = None # if k == 'Davg' or k == 'sigDavg': # info = str(round(stats[k][sid],3)) if k == 'alpha' or k == 'Davg': tags = ('average', 'sd', 'quantiles') info = [str(roundna(3)(stats[k][sid][0])), str(roundna(3)(stats[k][sid][1])), stats[k][sid][2]] info = ', '.join([tags[i]+': '+info[i] for i in range(len(info))]) print >>fh, '\t'.join([sampleID, k, info]) else: info = ', '.join(map(lambda x: ':'.join(map(str, x)), sorted(stats[k][sid].items()))) print >>fh, '\t'.join([sampleID, k, info]) print >> fh print >> fh, 'Statistics for significant SNPs' print >> fh tags = ('average', 'sd', 'quantiles') for P in map(str,QvalueRange): for k in sorted(Pstats[P].keys()): info = None if k == 'sigalpha' or k == 'sigDavg': info = [str(roundna(3)(Pstats[P][k][sid][0])), str(roundna(3)(Pstats[P][k][sid][1])), Pstats[P][k][sid][2]] info = ', '.join([tags[i]+': '+info[i] for i in range(len(info))]) # print >>fh, '\t'.join([sampleID, str(P), k, info]) elif k == 'Qs' or k=='sigG' or k=='Gtype': info = ', '.join(map(lambda x: ':'.join(map(str, x)), sorted(Pstats[P][k][sid].items()))) print >>fh, '\t'.join([sampleID, str(P), k, info]) print >> fh fh.close() print 'DONE PART IV: DIAGNOSTICS.\n' # ================ # Part V: ASSEMBLY # ================ if SAVEASSEMBLY: print 'PART V: Building new genome assembly.' 
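# For each sample and each stringency Q, a fresh copy of the reference fasta is
# loaded and every significant call is substituted into it: the called genotype
# is collapsed to a single IUPAC base with nts2iupac (heterozygous calls thus
# become ambiguity codes) before replacing the reference base at that position,
# and the edited sequences are written under assembled_genomes/. A concatenated
# SNP string across all samples is also saved (multi-fasta and phylip) for
# phylogenetic use.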
createdir(phylodir); createdir(findir) sortedSampleIDs = sorted(seqRuns.keys()) # master copy # G = readFasta(bfaref, TOUPPER=True) if 1: # create a new assembly sequence including novel significant SNPs for sampleID in sortedSampleIDs: for minPvalue in QvalueRange: snpfile = snpdir+sampleID+boundslabel+'.Q%s.txt'%(minPvalue) print ' - Saving new assembly genome...', sampleID, 'at Q >', minPvalue # load reference genome - G will be editied by new SNPs Gtmp = readFasta(bfaref, TOUPPER=True) # load significant SNPs # by right, this should be processed line by line if os.access(snpfile, os.F_OK): dersnps,foo,foo = readTable(snpfile, rownames=False, header=False, comments=False) dct = {} for i in range(len(dersnps)): # row = dersnps[i][0] if dersnps[i][0][0] != '#': chrom = dersnps[i][0] pos = int(dersnps[i][1]) refsnp = dersnps[i][2] cons = dersnps[i][3] allele = nts2iupac(list(cons)).upper() try: dct[chrom] += [(pos,allele)] except KeyError: dct[chrom] = [(pos,allele)] # edit reference genome using these SNPs for chrom in dct.keys(): seq = list(Gtmp[chrom]['seq']) # change significant SNP positions try: for pos,allele in dct[chrom]: seq[pos] = allele except IndexError: print 'Can\'t fetch position: %s,%s | seq length is only %s'%(chrom,pos,len(seq)) Gtmp[chrom]['seq'] = ''.join(seq) printFasta(Gtmp, findir+sampleID+boundslabel+'.Q%s.fasta'%(minPvalue)) else: print ' - Cannot access snp file:', snpfile # save concatenated string of the called derived SNPs for each line # need to know ALL positions of SNPs across lines to get equal length multiple alignment for minPvalue in QvalueRange: cpd = {} # chrom,pos dict: keyed by chrom,pos pairs returning the ref snp at that position cpdhits = {} # record positions for all SNPs across all samples for sampleID in sortedSampleIDs: print ' - Generating SNP string...', sampleID, 'at Q >', minPvalue snpfile = snpdir+sampleID+boundslabel+'.Q%s.txt'%(minPvalue) if os.access(snpfile, os.F_OK): dersnps,foo,foo = readTable(snpfile, rownames=False, header=False, comments=False) for i in xrange(len(dersnps)): if dersnps[i][0][0] != '#': chrom = dersnps[i][0] pos = int(dersnps[i][1]) refsnp = dersnps[i][2].upper() cpd[(chrom,pos)] = refsnp # store this as a position of interest # record number of lines with a SNP here try: cpdhits[(chrom,pos)] += 1 except KeyError: cpdhits[(chrom,pos)] = 1 # now we have all the positions that are hit by some line # save a concatenated snp string skeys = cpd.keys(); skeys.sort() # sorting same for all taxa # skeys = [(chrom,pos) for (chrom,pos) in sorted(cpd.keys())] # sorting same for all taxa snpstrings = [] keepsids = [] # final list of sampleIDs (confirmed SNP file access) # first add the reference string snpstrings += [''.join(map(lambda k: cpd[k], skeys))] # cpd points to reference allele keepsids += ['Reference'] # add alleles from a second external file if specialReferenceFiles: for srf in specialReferenceFiles.split(','): G2 = readFasta(srf, TOUPPER=True) snpstrings += [''.join([G2[chrom]['seq'][pos] for (chrom,pos) in skeys])] srname = srf.split('/')[-1].split('.')[0] keepsids += [srname] del G2 for sampleID in sortedSampleIDs: snpfile = snpdir+sampleID+boundslabel+'.Q%s.txt'%(minPvalue) if os.access(snpfile, os.F_OK): keepsids += [sampleID] dersnps,foo,foo = readTable(snpfile, rownames=False, header=False, comments=False) tmp = {} # record SNP positions for this sample for i in xrange(len(dersnps)): if dersnps[i][0][0] != '#': chrom = dersnps[i][0] pos = int(dersnps[i][1]) dersnp = nts2iupac(list(dersnps[i][3])).upper() 
tmp[(chrom,pos)] = dersnp snpstr = '' for cp in skeys: # recall, this is list of ALL SNP positions across samples refsnp = cpd[cp] # if derived snp in this line, use it, otherwise use the reference if cp in tmp: snpstr += tmp[cp] else: snpstr += refsnp snpstrings += [snpstr] fh = open(phylodir+sampleID+boundslabel+'.SNPstring.Q%s.txt'%(minPvalue), 'w') print >> fh, snpstr fh.close() # now save a multi-fasta and phylip file at this stringency level snpfasta = list2fasta(snpstrings, ids=keepsids) printFasta(snpfasta, file=phylodir+'SNPstring-P%s.fasta'%(minPvalue)) fasta2Phylip(snpfasta, file=phylodir+'SNPstring-P%s.phylip.txt'%(minPvalue), infile='', verbose=False) print 'DONE PART V: BUILDING NEW GENOME ASSEMBLY.\n' print '\nDONE SNIPER.PY'