gatk-3.8/python/validatePosterior.py

print("opening...")
ea_vcf = open("/humgen/gsa-hpprojects/analysis/privateMutations/eomi+autism/resources/callsets/eomi+autism/eomi+autism_batch.merged.vcf")
print("reading past header...")
ln = ea_vcf.readline()
import random
while (ln.startswith("#")):
    ln = ea_vcf.readline()

numcalc = 0
nread = -1

def getAC(e):
    if ( e.startswith("0/0") ):
        return 0
    elif ( e.startswith("0/1") or e.startswith("1/0") ):
        return 1
    elif ( e.startswith("1/1") ):
        return 2
    else:
        print("Warning: "+e)
        return 0

def calcTrans(line):
    spline = line.strip().split("\t")
    gtypes = filter(lambda y: y.find("./.") == -1, spline[9:len(spline)])
    if ( len(gtypes) < 1800 ):
        return (-1,-1)
    random.shuffle(gtypes)
    firstAC = reduce(lambda x,y: x + y , map(lambda u: getAC(u),gtypes[0:900]))
    if ( firstAC > 5 ):
        return (-1,-1)
    secondAC = reduce(lambda x,y: x + y, map(lambda u: getAC(u),gtypes[900:1800]))
    return (firstAC,secondAC)

print("Calculating...")
counts = filter(lambda u: u[0] > -1, map(lambda z: calcTrans(z) ,ea_vcf.readlines()))
print("Lines actually processed: %d" % len(counts))

cdict = dict()
for c in counts:
    if ( not c in cdict ):
        cdict[c] = 0
    cdict[c] += 1

out = open("posterior_counts.txt",'w')
for c in cdict:
    out.write("%d\t%d\t%d\n" % (c[0],c[1],cdict[c]))