First pass at a script that generates per-sample metrics from a pipeline yaml

input file. Output is an R-parseable tsv. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5984 348d0f76-0448-11de-a6fe-93d51630548a
2011-06-13 18:38:21 +00:00 · 2011-06-13 18:38:21 +00:00 · 1d1c9da783
parent 44a29680bf
commit 1d1c9da783
1 changed files with 73 additions and 0 deletions
--- a/python/generate_per_sample_metrics.py
+++ b/python/generate_per_sample_metrics.py
@ -0,0 +1,73 @@
+from java.lang import *
+from java.io import File,FileReader
+from net.sf.picard.metrics import MetricsFile
+from org.broadinstitute.sting.datasources.pipeline import Pipeline
+from org.broadinstitute.sting.utils.yaml import YamlUtils
+
+import os,string,sys
+
+def median(l):
+    return sorted(l)[(len(l)+1)/2]
+def mean(l):
+    return float(sum(l))/len(l)
+
+def get_metrics(filename):
+    if not os.path.exists(filename):
+        return None
+    file_reader = FileReader(filename)
+    metrics_file = MetricsFile()
+    metrics_file.read(file_reader)
+    metrics = metrics_file.getMetrics()
+    file_reader.close()
+    return metrics
+
+if len(sys.argv) != 2:
+    print 'USAGE: %s <pipeline_file.yaml>'
+    sys.exit(1)
+if not os.path.exists(sys.argv[1]):
+    print 'Pipeline file %s not found' % sys.argv[1]
+    sys.exit(1)
+
+pipeline_file = sys.argv[1]
+pipeline = YamlUtils.load(Pipeline,File(pipeline_file))
+
+header = ['SAMPLE','HAPLOTYPES_CONFIDENTLY_MATCHING.MIN','HAPLOTYPES_CONFIDENTLY_MATCHING.MAX','HAPLOTYPES_CONFIDENTLY_MATCHING.MEDIAN',
+          'BAIT_SET','GENOME_SIZE','PCT_SELECTED_BASES','MEAN_TARGET_COVERAGE','ZERO_CVG_TARGETS_PCT','FOLD_80_BASE_PENALTY','HS_LIBRARY_SIZE',
+          'PCT_PF_READS_ALIGNED','PF_HQ_ERROR_RATE','MEAN_READ_LENGTH','BAD_CYCLES','STRAND_BALANCE','PCT_CHIMERAS','PCT_ADAPTER','MEDIAN_INSERT_SIZE',
+          'TOTAL_SNPS']
+data = ['%s'] * len(header)
+
+print string.join(header,'\t')
+
+# get a representative BAM file for each sample, to use as a base path.  Note that this assumes every sample corresponds to the same base path.
+samples = dict()
+for sample in pipeline.getSamples():
+    if sample.getBamFiles().size() > 0:
+        samples[sample.getId()] = sample.getBamFiles().values().iterator().next()
+
+for sample_id,filename in samples.items():
+    basepath = filename.getAbsolutePath()[0:filename.getAbsolutePath().rindex('.bam')]
+    
+    fingerprinting_summary_metrics = get_metrics('%s.%s' % (basepath,'fingerprinting_summary_metrics'))
+
+    if fingerprinting_summary_metrics != None:
+        haplotypes_confidently_matching = [metric.HAPLOTYPES_CONFIDENTLY_MATCHING for metric in fingerprinting_summary_metrics]
+        min_haplotypes_confidently_matching = str(min(haplotypes_confidently_matching))
+        max_haplotypes_confidently_matching = str(max(haplotypes_confidently_matching))
+        median_haplotypes_confidently_matching = str(median(haplotypes_confidently_matching))
+    else:
+        min_haplotypes_confidently_matching = 'N/A'
+        max_haplotypes_confidently_matching = 'N/A'
+        median_haplotypes_confidently_matching = 'N/A'
+
+    hybrid_selection_metrics = get_metrics('%s.%s' % (basepath,'hybrid_selection_metrics'))[0]
+    alignment_summary_metrics = get_metrics('%s.%s' % (basepath,'alignment_summary_metrics'))[0]
+    insert_size_metrics = get_metrics('%s.%s' % (basepath,'insert_size_metrics'))[0]
+    dbsnp_matches = get_metrics('%s.%s' % (basepath,'dbsnp_matches'))[0]
+
+    print string.join(data,'\t')%(sample_id,str(min(haplotypes_confidently_matching)),str(max(haplotypes_confidently_matching)),str(median(haplotypes_confidently_matching)),
+                                  hybrid_selection_metrics.BAIT_SET,hybrid_selection_metrics.GENOME_SIZE,hybrid_selection_metrics.PCT_SELECTED_BASES,
+                                  hybrid_selection_metrics.MEAN_TARGET_COVERAGE,hybrid_selection_metrics.ZERO_CVG_TARGETS_PCT,hybrid_selection_metrics.FOLD_80_BASE_PENALTY,
+                                  hybrid_selection_metrics.HS_LIBRARY_SIZE,alignment_summary_metrics.PCT_PF_READS_ALIGNED,alignment_summary_metrics.PF_HQ_ERROR_RATE,
+                                  alignment_summary_metrics.MEAN_READ_LENGTH,alignment_summary_metrics.BAD_CYCLES,alignment_summary_metrics.STRAND_BALANCE,
+                                  alignment_summary_metrics.PCT_CHIMERAS,alignment_summary_metrics.PCT_ADAPTER,insert_size_metrics.MEDIAN_INSERT_SIZE,dbsnp_matches.TOTAL_SNPS)