gatk-3.8/python/validatePosterior.py

48 lines
1.3 KiB
Python
Raw Normal View History

print("opening...")
ea_vcf = open("/humgen/gsa-hpprojects/analysis/privateMutations/eomi+autism/resources/callsets/eomi+autism/eomi+autism_batch.merged.vcf")
print("reading past header...")
ln = ea_vcf.readline()
import random
while (ln.startswith("#")):
ln = ea_vcf.readline()
numcalc = 0
nread = -1
def getAC(e):
if ( e.startswith("0/0") ):
return 0
elif ( e.startswith("0/1") or e.startswith("1/0") ):
return 1
elif ( e.startswith("1/1") ):
return 2
else:
print("Warning: "+e)
return 0
def calcTrans(line):
spline = line.strip().split("\t")
gtypes = filter(lambda y: y.find("./.") == -1, spline[9:len(spline)])
if ( len(gtypes) < 1800 ):
return (-1,-1)
random.shuffle(gtypes)
firstAC = reduce(lambda x,y: x + y , map(lambda u: getAC(u),gtypes[0:900]))
if ( firstAC > 5 ):
return (-1,-1)
secondAC = reduce(lambda x,y: x + y, map(lambda u: getAC(u),gtypes[900:1800]))
return (firstAC,secondAC)
print("Calculating...")
counts = filter(lambda u: u[0] > -1, map(lambda z: calcTrans(z) ,ea_vcf.readlines()))
print("Lines actually processed: %d" % len(counts))
cdict = dict()
for c in counts:
if ( not c in cdict ):
cdict[c] = 0
cdict[c] += 1
out = open("posterior_counts.txt",'w')
for c in cdict:
out.write("%d\t%d\t%d\n" % (c[0],c[1],cdict[c]))