2011-06-14 23:22:17 +08:00
|
|
|
#
|
|
|
|
|
# Reads in selected Picard metrics, generating an R-compatible TSV suitable for pre-QC analysis.
|
|
|
|
|
#
|
|
|
|
|
# To run:
|
|
|
|
|
# /humgen/gsa-hpprojects/software/bin/jython2.5.2/jython \
|
2011-06-15 01:43:43 +08:00
|
|
|
# -J-classpath $STING_HOME/dist/sam-1.47.869.jar:$STING_HOME/dist/picard-1.47.869.jar:$STING_HOME/dist/picard-private-parts-1941.jar \
|
2011-06-14 23:22:17 +08:00
|
|
|
# $STING_HOME/python/generate_per_sample_metrics.py <bam.list> > <output_metrics_file.tsv>
|
|
|
|
|
#
|
|
|
|
|
# To add a new metric:
|
|
|
|
|
# - If the metric file is new to Picard, add the relevant parser to the picard-private jar
|
|
|
|
|
# (see http://www.broadinstitute.org/gsa/wiki/index.php/Adding_and_updating_dependencies for details).
|
|
|
|
|
# - Add the field name to the header array.
|
|
|
|
|
# - Add the field data to the statement printing the data array.
|
|
|
|
|
#
|
2011-06-14 02:38:21 +08:00
|
|
|
from java.lang import *
|
|
|
|
|
from java.io import File,FileReader
|
2011-06-23 06:53:53 +08:00
|
|
|
|
|
|
|
|
from edu.mit.broad.picard.genotype.concordance import DbSnpMatchMetrics
|
|
|
|
|
from net.sf.picard.analysis import AlignmentSummaryMetrics,InsertSizeMetrics
|
|
|
|
|
from net.sf.picard.analysis.directed import HsMetrics
|
2011-06-14 02:38:21 +08:00
|
|
|
from net.sf.picard.metrics import MetricsFile
|
|
|
|
|
|
|
|
|
|
import os,string,sys
|
|
|
|
|
|
|
|
|
|
def median(l):
    """Return the median element of l (the upper-middle element for even lengths).

    Fix: the previous index, (len(l)+1)/2, was off by one -- it raised
    IndexError for a single-element list and returned the element *after*
    the middle for every odd-length list.
    """
    return sorted(l)[len(l) // 2]
|
|
|
|
|
def mean(l):
    """Return the arithmetic mean of l as a float."""
    total = sum(l)
    return total / float(len(l))
|
|
|
|
|
|
2011-06-23 06:53:53 +08:00
|
|
|
def get_all_metrics(filename):
    """Parse a Picard metrics file and return every metrics row it contains.

    Returns None when the file does not exist -- individual metrics files are
    optional per sample, and callers treat None as "no data".
    """
    if not os.path.exists(filename):
        return None
    file_reader = FileReader(filename)
    try:
        metrics_file = MetricsFile()
        metrics_file.read(file_reader)
        return metrics_file.getMetrics()
    finally:
        # Release the underlying Java stream even if parsing throws;
        # previously a parse failure leaked the open FileReader.
        file_reader.close()
|
|
|
|
|
|
2011-06-23 06:53:53 +08:00
|
|
|
def get_sample_summary_metrics_fields(type):
    """Return the names of the public fields on a Picard metrics class.

    Python-internal '__'-prefixed names are skipped; everything else reported
    by the class's getFields() reflection call is kept, in declaration order.
    """
    names = []
    for field in type.getFields():
        field_name = field.getName()
        if not field_name.startswith('__'):
            names.append(field_name)
    return names
|
|
|
|
|
|
|
|
|
|
def get_sample_summary_metrics(filename):
    """Return the single (sample-level) metrics row of a Picard metrics file.

    Returns None when the file does not exist.  Previously this duplicated
    the whole of get_all_metrics() line for line; delegate instead and keep
    one copy of the parsing logic.
    """
    metrics = get_all_metrics(filename)
    if metrics is None:
        return None
    # Sample-summary files carry exactly one row; as before, an empty
    # metrics list raises IndexError rather than being silently ignored.
    return metrics[0]
|
|
|
|
|
|
2011-06-14 02:38:21 +08:00
|
|
|
if len(sys.argv) != 2:
|
|
|
|
|
print 'USAGE: %s <pipeline_file.yaml>'
|
|
|
|
|
sys.exit(1)
|
|
|
|
|
if not os.path.exists(sys.argv[1]):
|
2011-06-14 08:15:01 +08:00
|
|
|
print 'BAM list %s not found' % sys.argv[1]
|
2011-06-14 02:38:21 +08:00
|
|
|
sys.exit(1)
|
|
|
|
|
|
2011-06-14 08:15:01 +08:00
|
|
|
bam_list_filename = sys.argv[1]
|
2011-06-14 02:38:21 +08:00
|
|
|
|
2011-06-23 06:53:53 +08:00
|
|
|
sample_summary_metrics_types = [ (HsMetrics,'hybrid_selection_metrics'),
|
|
|
|
|
(AlignmentSummaryMetrics, 'alignment_summary_metrics'),
|
|
|
|
|
(InsertSizeMetrics, 'insert_size_metrics'),
|
|
|
|
|
(DbSnpMatchMetrics, 'dbsnp_matches') ]
|
2011-06-14 02:38:21 +08:00
|
|
|
|
2011-06-23 06:54:09 +08:00
|
|
|
header = ['sample','SUM_OF_FINGERPRINT_LODS','HAPLOTYPES_CONFIDENTLY_MATCHING.MIN','HAPLOTYPES_CONFIDENTLY_MATCHING.MAX','HAPLOTYPES_CONFIDENTLY_MATCHING.MEDIAN']
|
2011-06-23 06:53:53 +08:00
|
|
|
for metric_type in sample_summary_metrics_types:
|
|
|
|
|
header.extend(get_sample_summary_metrics_fields(metric_type[0]))
|
2011-06-14 02:38:21 +08:00
|
|
|
print string.join(header,'\t')
|
|
|
|
|
|
|
|
|
|
# Map each sample id to one representative BAM path, used later as the base
# path for that sample's metrics files.  The sample id is the directory three
# levels above the BAM; this assumes every BAM of a sample shares a base path.
samples = dict()
bam_list = open(bam_list_filename,'r')
for line in bam_list:
    bam_filename = line.strip()
    if not bam_filename:
        # skip blank lines in the BAM list
        continue
    parts = bam_filename.split('/')
    # third-from-last path component names the sample directory
    samples[parts[len(parts) - 3]] = bam_filename
bam_list.close()
|
2011-06-14 02:38:21 +08:00
|
|
|
|
|
|
|
|
for sample_id,filename in samples.items():
|
2011-06-14 08:15:01 +08:00
|
|
|
basepath = filename[:filename.rindex('.bam')]
|
2011-06-14 02:38:21 +08:00
|
|
|
|
2011-06-23 06:53:53 +08:00
|
|
|
fingerprinting_summary_metrics = get_all_metrics('%s.%s' % (basepath,'fingerprinting_summary_metrics'))
|
2011-06-14 02:38:21 +08:00
|
|
|
|
|
|
|
|
if fingerprinting_summary_metrics != None:
|
|
|
|
|
haplotypes_confidently_matching = [metric.HAPLOTYPES_CONFIDENTLY_MATCHING for metric in fingerprinting_summary_metrics]
|
2011-06-23 06:54:09 +08:00
|
|
|
sum_of_fingerprint_lods = str(sum([metric.LOD_EXPECTED_SAMPLE for metric in fingerprinting_summary_metrics]))
|
2011-06-14 02:38:21 +08:00
|
|
|
min_haplotypes_confidently_matching = str(min(haplotypes_confidently_matching))
|
|
|
|
|
max_haplotypes_confidently_matching = str(max(haplotypes_confidently_matching))
|
|
|
|
|
median_haplotypes_confidently_matching = str(median(haplotypes_confidently_matching))
|
|
|
|
|
else:
|
2011-06-23 06:54:09 +08:00
|
|
|
sum_of_fingerprint_lods = 'NA'
|
2011-06-14 02:53:38 +08:00
|
|
|
min_haplotypes_confidently_matching = 'NA'
|
|
|
|
|
max_haplotypes_confidently_matching = 'NA'
|
|
|
|
|
median_haplotypes_confidently_matching = 'NA'
|
2011-06-14 02:38:21 +08:00
|
|
|
|
2011-06-23 06:54:09 +08:00
|
|
|
data = [sample_id,sum_of_fingerprint_lods,min_haplotypes_confidently_matching,max_haplotypes_confidently_matching,median_haplotypes_confidently_matching]
|
2011-06-14 02:38:21 +08:00
|
|
|
|
2011-06-23 06:53:53 +08:00
|
|
|
for metrics_type,metrics_extension in sample_summary_metrics_types:
|
|
|
|
|
metrics = get_sample_summary_metrics('%s.%s' % (basepath,metrics_extension))
|
|
|
|
|
data.extend([getattr(metrics, metrics_field_name) for metrics_field_name in get_sample_summary_metrics_fields(metrics_type)])
|
|
|
|
|
print string.join(['%s']*len(header),'\t')%tuple(data)
|