56 lines
2.3 KiB
Python
56 lines
2.3 KiB
Python
|
|
import os
|
||
|
|
base1 = "/humgen/gsa-hphome1/chartl/projects/exome/coverage/whole_exome_broad/broad/"
|
||
|
|
base2 = "/humgen/gsa-hphome1/chartl/projects/exome/coverage/whole_exome_broad/intersect/"
|
||
|
|
|
||
|
|
def getField(fname,header):
|
||
|
|
f = open(fname)
|
||
|
|
h = f.readline().split("\t")
|
||
|
|
idx = h.index(header)
|
||
|
|
return map(lambda x: x.strip().split("\t")[idx],filter(lambda u: not u.startswith("Tot"),f.readlines()))
|
||
|
|
|
||
|
|
def getFiles(base):
|
||
|
|
num_jobs = len(os.listdir(base+"GATKDispatcher/"))
|
||
|
|
sum_file = base+"GATKDispatcher/dispatch%d/job%d.sample_summary"
|
||
|
|
int_file = base+"GATKDispatcher/dispatch%d/job%d.sample_interval_summary"
|
||
|
|
files = map( lambda x: (sum_file % (x,x), int_file % (x,x)), range(num_jobs))
|
||
|
|
return files
|
||
|
|
|
||
|
|
def getBases(base):
|
||
|
|
files = getFiles(base)
|
||
|
|
lines = map(lambda x: getField(x[1],"Target"),files)
|
||
|
|
sizes = map( lambda iList: reduce ( lambda u,v: u + v , map(lambda ival: int(ival.split(":")[1].split("-")[1])-int(ival.split(":")[1].split("-")[0]), iList ) ) , lines )
|
||
|
|
return(sizes)
|
||
|
|
|
||
|
|
def getPctAbove(base):
|
||
|
|
files = getFiles(base)
|
||
|
|
pct_above = map(lambda y: map(lambda z: float(z),y),map(lambda x: getField(x[0],"%_bases_above_20"),files))
|
||
|
|
return pct_above
|
||
|
|
|
||
|
|
def getTotals(base):
|
||
|
|
files = getFiles(base)
|
||
|
|
totals = map(lambda y: map(lambda z: int(z),y),map(lambda x: getField(x[0],"total"),files))
|
||
|
|
return totals
|
||
|
|
|
||
|
|
bases = getBases(base1) + getBases(base2)
|
||
|
|
totals = getTotals(base1) + getTotals(base2)
|
||
|
|
pct_above = getPctAbove(base1) + getPctAbove(base2)
|
||
|
|
files = getFiles(base1) + getFiles(base2)
|
||
|
|
|
||
|
|
total_bases = reduce(lambda u,v: u + v , bases)
|
||
|
|
|
||
|
|
total_cvg = map(lambda u: 0, range(len(pct_above[0])))
|
||
|
|
total_covered_20 = map(lambda u: 0, range(len(pct_above[0])))
|
||
|
|
for f_idx in range(len(files)):
|
||
|
|
for sam_idx in range(len(pct_above[0])):
|
||
|
|
total_covered_20[sam_idx] += pct_above[f_idx][sam_idx]*bases[f_idx]
|
||
|
|
total_cvg[sam_idx] += totals[f_idx][sam_idx]
|
||
|
|
|
||
|
|
pct_above_20 = map( lambda x: x/total_bases, total_covered_20)
|
||
|
|
mean_cvgs = map( lambda x: (x+0.0)/total_bases, total_cvg)
|
||
|
|
#print(total_bases)
|
||
|
|
#print(total_cvg)
|
||
|
|
print("Mean\t%_above_20")
|
||
|
|
mean_str = map(lambda u: "%.1f" % u, mean_cvgs)
|
||
|
|
above_str = map(lambda u: "%.1f" % u, pct_above_20)
|
||
|
|
print(reduce( lambda u,v: u + "\n" + v, map(lambda idx: mean_str[idx]+"\t"+above_str[idx],range(len(mean_str)))))
|