2011-06-14 23:22:17 +08:00
|
|
|
#
|
|
|
|
|
# Reads in selected Picard metrics, generating an R-compatible TSV suitable for pre-QC analysis.
|
|
|
|
|
#
|
|
|
|
|
# To run:
|
|
|
|
|
# /humgen/gsa-hpprojects/software/bin/jython2.5.2/jython \
|
2011-06-15 01:43:43 +08:00
|
|
|
# -J-classpath $STING_HOME/dist/sam-1.47.869.jar:$STING_HOME/dist/picard-1.47.869.jar:$STING_HOME/dist/picard-private-parts-1941.jar \
|
2011-06-14 23:22:17 +08:00
|
|
|
# $STING_HOME/python/generate_per_sample_metrics.py <bam.list> > <output_metrics_file.tsv>
|
|
|
|
|
#
|
|
|
|
|
# To add a new metric:
|
|
|
|
|
# - If the metric file is new to Picard, add the relevant parser to the picard-private jar
|
|
|
|
|
# (see http://www.broadinstitute.org/gsa/wiki/index.php/Adding_and_updating_dependencies for details).
|
|
|
|
|
# - Add the field name to the header array.
|
|
|
|
|
# - Add the field data to the statement printing the data array.
|
|
|
|
|
#
|
2011-06-14 02:38:21 +08:00
|
|
|
from java.lang import *
|
|
|
|
|
from java.io import File,FileReader
|
2011-06-23 06:53:53 +08:00
|
|
|
|
|
|
|
|
from edu.mit.broad.picard.genotype.concordance import DbSnpMatchMetrics
|
|
|
|
|
from net.sf.picard.analysis import AlignmentSummaryMetrics,InsertSizeMetrics
|
|
|
|
|
from net.sf.picard.analysis.directed import HsMetrics
|
2011-06-14 02:38:21 +08:00
|
|
|
from net.sf.picard.metrics import MetricsFile
|
|
|
|
|
|
|
|
|
|
import os,string,sys
|
|
|
|
|
|
|
|
|
|
def median(l):
    """Return the median element of l (the upper-middle element for even lengths).

    Fix: the previous index, (len(l)+1)/2, was off by one -- it raised
    IndexError for a single-element list and returned the element *after*
    the middle for every odd-length list.
    """
    return sorted(l)[len(l) // 2]
|
|
|
|
|
def mean(l):
    """Return the arithmetic mean of l as a float."""
    total = sum(l)
    return total / float(len(l))
|
|
|
|
|
|
2011-06-23 06:53:53 +08:00
|
|
|
def get_all_metrics(filename):
    """Parse a Picard metrics file and return every metrics row it contains.

    Returns None when the file does not exist -- individual metrics files are
    optional per sample, and callers treat None as "no data".
    """
    if not os.path.exists(filename):
        return None
    file_reader = FileReader(filename)
    try:
        metrics_file = MetricsFile()
        metrics_file.read(file_reader)
        return metrics_file.getMetrics()
    finally:
        # Release the underlying Java stream even if parsing throws;
        # previously a parse failure leaked the open FileReader.
        file_reader.close()
|
|
|
|
|
|
2011-06-23 06:53:53 +08:00
|
|
|
def get_sample_summary_metrics_fields(type):
    """Return the names of the public fields on a Picard metrics class.

    Python-internal '__'-prefixed names are skipped; everything else reported
    by the class's getFields() reflection call is kept, in declaration order.
    """
    names = []
    for field in type.getFields():
        field_name = field.getName()
        if not field_name.startswith('__'):
            names.append(field_name)
    return names
|
|
|
|
|
|
|
|
|
|
def get_sample_summary_metrics(filename):
    """Return the single (sample-level) metrics row of a Picard metrics file.

    Returns None when the file does not exist.  Previously this duplicated
    the whole of get_all_metrics() line for line; delegate instead and keep
    one copy of the parsing logic.
    """
    metrics = get_all_metrics(filename)
    if metrics is None:
        return None
    # Sample-summary files carry exactly one row; as before, an empty
    # metrics list raises IndexError rather than being silently ignored.
    return metrics[0]
|
|
|
|
|
|
2011-06-14 02:38:21 +08:00
|
|
|
if len(sys.argv) != 2:
|
|
|
|
|
print 'USAGE: %s <pipeline_file.yaml>'
|
|
|
|
|
sys.exit(1)
|
|
|
|
|
if not os.path.exists(sys.argv[1]):
|
2011-06-14 08:15:01 +08:00
|
|
|
print 'BAM list %s not found' % sys.argv[1]
|
2011-06-14 02:38:21 +08:00
|
|
|
sys.exit(1)
|
|
|
|
|
|
2011-06-14 08:15:01 +08:00
|
|
|
bam_list_filename = sys.argv[1]
|
2011-06-14 02:38:21 +08:00
|
|
|
|
2011-06-23 06:53:53 +08:00
|
|
|
sample_summary_metrics_types = [ (HsMetrics,'hybrid_selection_metrics'),
|
|
|
|
|
(AlignmentSummaryMetrics, 'alignment_summary_metrics'),
|
|
|
|
|
(InsertSizeMetrics, 'insert_size_metrics'),
|
|
|
|
|
(DbSnpMatchMetrics, 'dbsnp_matches') ]
|
2011-06-14 02:38:21 +08:00
|
|
|
|
2011-06-23 06:54:09 +08:00
|
|
|
header = ['sample','SUM_OF_FINGERPRINT_LODS','HAPLOTYPES_CONFIDENTLY_MATCHING.MIN','HAPLOTYPES_CONFIDENTLY_MATCHING.MAX','HAPLOTYPES_CONFIDENTLY_MATCHING.MEDIAN']
|
2011-06-23 06:53:53 +08:00
|
|
|
for metric_type in sample_summary_metrics_types:
|
|
|
|
|
header.extend(get_sample_summary_metrics_fields(metric_type[0]))
|
2011-06-14 02:38:21 +08:00
|
|
|
print string.join(header,'\t')
|
|
|
|
|
|
|
|
|
|
# Map each sample id to one representative BAM path, used later as the base
# path for that sample's metrics files.  The sample id is the directory three
# levels above the BAM; this assumes every BAM of a sample shares a base path.
samples = dict()
bam_list = open(bam_list_filename,'r')
for line in bam_list:
    bam_filename = line.strip()
    if not bam_filename:
        # skip blank lines in the BAM list
        continue
    parts = bam_filename.split('/')
    # third-from-last path component names the sample directory
    samples[parts[len(parts) - 3]] = bam_filename
bam_list.close()
|
2011-06-14 02:38:21 +08:00
|
|
|
|
|
|
|
|
for sample_id,filename in samples.items():
|
2011-06-14 08:15:01 +08:00
|
|
|
basepath = filename[:filename.rindex('.bam')]
|
2011-06-14 02:38:21 +08:00
|
|
|
|
2011-06-23 06:53:53 +08:00
|
|
|
fingerprinting_summary_metrics = get_all_metrics('%s.%s' % (basepath,'fingerprinting_summary_metrics'))
|
2011-06-14 02:38:21 +08:00
|
|
|
|
|
|
|
|
if fingerprinting_summary_metrics != None:
|
|
|
|
|
haplotypes_confidently_matching = [metric.HAPLOTYPES_CONFIDENTLY_MATCHING for metric in fingerprinting_summary_metrics]
|
2011-06-23 06:54:09 +08:00
|
|
|
sum_of_fingerprint_lods = str(sum([metric.LOD_EXPECTED_SAMPLE for metric in fingerprinting_summary_metrics]))
|
2011-06-14 02:38:21 +08:00
|
|
|
min_haplotypes_confidently_matching = str(min(haplotypes_confidently_matching))
|
|
|
|
|
max_haplotypes_confidently_matching = str(max(haplotypes_confidently_matching))
|
|
|
|
|
median_haplotypes_confidently_matching = str(median(haplotypes_confidently_matching))
|
|
|
|
|
else:
|
2011-06-23 06:54:09 +08:00
|
|
|
sum_of_fingerprint_lods = 'NA'
|
2011-06-14 02:53:38 +08:00
|
|
|
min_haplotypes_confidently_matching = 'NA'
|
|
|
|
|
max_haplotypes_confidently_matching = 'NA'
|
|
|
|
|
median_haplotypes_confidently_matching = 'NA'
|
2011-06-14 02:38:21 +08:00
|
|
|
|
2011-06-23 06:54:09 +08:00
|
|
|
data = [sample_id,sum_of_fingerprint_lods,min_haplotypes_confidently_matching,max_haplotypes_confidently_matching,median_haplotypes_confidently_matching]
|
2011-06-14 02:38:21 +08:00
|
|
|
|
2011-06-23 06:53:53 +08:00
|
|
|
for metrics_type,metrics_extension in sample_summary_metrics_types:
|
|
|
|
|
metrics = get_sample_summary_metrics('%s.%s' % (basepath,metrics_extension))
|
|
|
|
|
data.extend([getattr(metrics, metrics_field_name) for metrics_field_name in get_sample_summary_metrics_fields(metrics_type)])
|
|
|
|
|
print string.join(['%s']*len(header),'\t')%tuple(data)
|