diff --git a/python/gatherSampleSummaries.py b/python/gatherSampleSummaries.py new file mode 100644 index 000000000..371309340 --- /dev/null +++ b/python/gatherSampleSummaries.py @@ -0,0 +1,55 @@ +import os +base1 = "/humgen/gsa-hphome1/chartl/projects/exome/coverage/whole_exome_broad/broad/" +base2 = "/humgen/gsa-hphome1/chartl/projects/exome/coverage/whole_exome_broad/intersect/" + +def getField(fname,header): + f = open(fname) + h = f.readline().split("\t") + idx = h.index(header) + return map(lambda x: x.strip().split("\t")[idx],filter(lambda u: not u.startswith("Tot"),f.readlines())) + +def getFiles(base): + num_jobs = len(os.listdir(base+"GATKDispatcher/")) + sum_file = base+"GATKDispatcher/dispatch%d/job%d.sample_summary" + int_file = base+"GATKDispatcher/dispatch%d/job%d.sample_interval_summary" + files = map( lambda x: (sum_file % (x,x), int_file % (x,x)), range(num_jobs)) + return files + +def getBases(base): + files = getFiles(base) + lines = map(lambda x: getField(x[1],"Target"),files) + sizes = map( lambda iList: reduce ( lambda u,v: u + v , map(lambda ival: int(ival.split(":")[1].split("-")[1])-int(ival.split(":")[1].split("-")[0]), iList ) ) , lines ) + return(sizes) + +def getPctAbove(base): + files = getFiles(base) + pct_above = map(lambda y: map(lambda z: float(z),y),map(lambda x: getField(x[0],"%_bases_above_20"),files)) + return pct_above + +def getTotals(base): + files = getFiles(base) + totals = map(lambda y: map(lambda z: int(z),y),map(lambda x: getField(x[0],"total"),files)) + return totals + +bases = getBases(base1) + getBases(base2) +totals = getTotals(base1) + getTotals(base2) +pct_above = getPctAbove(base1) + getPctAbove(base2) +files = getFiles(base1) + getFiles(base2) + +total_bases = reduce(lambda u,v: u + v , bases) + +total_cvg = map(lambda u: 0, range(len(pct_above[0]))) +total_covered_20 = map(lambda u: 0, range(len(pct_above[0]))) +for f_idx in range(len(files)): + for sam_idx in range(len(pct_above[0])): + total_covered_20[sam_idx] += pct_above[f_idx][sam_idx]*bases[f_idx] + total_cvg[sam_idx] += totals[f_idx][sam_idx] + +pct_above_20 = map( lambda x: x/total_bases, total_covered_20) +mean_cvgs = map( lambda x: (x+0.0)/total_bases, total_cvg) +#print(total_bases) +#print(total_cvg) +print("Mean\t%_above_20") +mean_str = map(lambda u: "%.1f" % u, mean_cvgs) +above_str = map(lambda u: "%.1f" % u, pct_above_20) +print(reduce( lambda u,v: u + "\n" + v, map(lambda idx: mean_str[idx]+"\t"+above_str[idx],range(len(mean_str)))))