From 3180fffd433de69b6ce250cf2486289325160d3d Mon Sep 17 00:00:00 2001 From: rpoplin Date: Tue, 1 Dec 2009 16:49:05 +0000 Subject: [PATCH] Eliminated unnecessary boxing of longs in RecalDatum. Changes to RecalDatum in preparation for new AnalyzeCovariates script. Updated TableRecalibrationWalker to make use of these changes. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2199 348d0f76-0448-11de-a6fe-93d51630548a --- .../recalibration/RecalDataManager.java | 31 +++--------------- .../walkers/recalibration/RecalDatum.java | 32 ++++++++++++++++--- .../TableRecalibrationWalker.java | 11 +++---- .../RecalibrationWalkersIntegrationTest.java | 4 +-- 4 files changed, 39 insertions(+), 39 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index 461229ea8..2b804e7c4 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; @@ -47,12 +48,10 @@ public class RecalDataManager { private NHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed private NHashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed private ArrayList> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed - private NHashMap dataSumExpectedErrors; // Table used to calculate the overall aggregate reported quality score in which everything except read group is collapsed private NHashMap dataCollapsedReadGroupDouble; // Table of empirical qualities where everything except read group has been collapsed private NHashMap dataCollapsedQualityScoreDouble; // Table of empirical qualities where everything except read group and quality score has been collapsed private ArrayList> dataCollapsedByCovariateDouble; // Table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed - public NHashMap aggregateReportedQuality; // Table of the overall aggregate reported quality score in which everything except read group is collapsed public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams @@ -70,8 +69,6 @@ public class RecalDataManager { for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate dataCollapsedByCovariate.add( new NHashMap() ); } - dataSumExpectedErrors = new NHashMap(); - aggregateReportedQuality = new NHashMap(); } else { data = new NHashMap( estimatedCapacity, 0.8f); } @@ -100,19 +97,7 @@ public class RecalDataManager { if( collapsedDatum == null ) { dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) ); } else { - collapsedDatum.increment(fullDatum); - } - - // Create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed - newKey = new ArrayList(); - newKey.add( key.get(0) ); // Make a new key with just the read group - Double sumExpectedErrors = dataSumExpectedErrors.get( newKey ); - if( sumExpectedErrors == null ) { - dataSumExpectedErrors.put( newKey, 0.0 ); - } else { - dataSumExpectedErrors.remove( newKey ); - sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * fullDatum.getNumObservations(); - dataSumExpectedErrors.put( newKey, sumExpectedErrors ); + collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported } newKey = new ArrayList(); @@ -123,7 +108,7 @@ public class RecalDataManager { if( collapsedDatum == null ) { dataCollapsedQualityScore.put( newKey, new RecalDatum(fullDatum) ); } else { - collapsedDatum.increment(fullDatum); + collapsedDatum.increment( fullDatum ); } // Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed @@ -136,7 +121,7 @@ public class RecalDataManager { if( collapsedDatum == null ) { dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) ); } else { - collapsedDatum.increment(fullDatum); + collapsedDatum.increment( fullDatum ); } } } @@ -160,8 +145,6 @@ public class RecalDataManager { // Looping over the entrySet is really expensive but worth it for( Map.Entry,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) { dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) ); - aggregateReportedQuality.put( entry.getKey(), // sum of the expected errors divided by number of observations turned into a Q score - QualityUtils.phredScaleErrorRate( dataSumExpectedErrors.get(entry.getKey()) / ((double)entry.getValue().getNumObservations()) ) ); } for( Map.Entry,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) { dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) ); @@ -172,15 +155,11 @@ public class RecalDataManager { } } - dataSumExpectedErrors.clear(); - dataCollapsedReadGroup.clear(); dataCollapsedQualityScore.clear(); for( int iii = 0; iii < numCovariates - 2; iii++ ) { dataCollapsedByCovariate.get(iii).clear(); } dataCollapsedByCovariate.clear(); - dataSumExpectedErrors = null; // Will never need this table again - dataCollapsedReadGroup = null; // Will never need this table again dataCollapsedQualityScore = null; // Will never need this table again dataCollapsedByCovariate = null; // Will never need this table again if( data != null ) { @@ -211,7 +190,7 @@ public class RecalDataManager { */ public final NHashMap getCollapsedDoubleTable( final int covariate ) { if( covariate == 0) { - return dataCollapsedReadGroupDouble; // Table of empirical qualities where everything except read group has been collapsed + return dataCollapsedReadGroupDouble; } else if( covariate == 1 ) { return dataCollapsedQualityScoreDouble; // Table of empirical qualities where everything except read group and quality score has been collapsed } else { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java index 28251908c..ab32beb42 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatum.java @@ -42,7 +42,7 @@ public class RecalDatum { private long numObservations; // number of bases seen in total private long numMismatches; // number of bases seen that didn't match the reference - + private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations //--------------------------------------------------------------------------------------------------------------- // @@ -52,16 +52,19 @@ public class RecalDatum { public RecalDatum() { numObservations = 0L; numMismatches = 0L; + estimatedQReported = 0.0; } - public RecalDatum( final long _numObservations, final long _numMismatches ) { + public RecalDatum( final long _numObservations, final long _numMismatches, final double _estimatedQReported ) { numObservations = _numObservations; numMismatches = _numMismatches; + estimatedQReported = _estimatedQReported; } public RecalDatum( final RecalDatum copy ) { this.numObservations = copy.numObservations; this.numMismatches = copy.numMismatches; + this.estimatedQReported = copy.estimatedQReported; } //--------------------------------------------------------------------------------------------------------------- @@ -80,6 +83,13 @@ public class RecalDatum { increment( other.numObservations, other.numMismatches ); } + public final void combine( final RecalDatum other ) { + double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); + this.increment( other.numObservations, other.numMismatches ); + this.estimatedQReported = -10 * Math.log10(sumErrors / (double)this.numObservations); + if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; } + } + public final void increment( final List data ) { for ( RecalDatum other : data ) { this.increment( other ); @@ -126,14 +136,26 @@ public class RecalDatum { return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte( smoothing ) ); } - public final Long getNumObservations() { + public final long getNumObservations() { return numObservations; } - public final Long getNumMismatches() { + public final long getNumMismatches() { return numMismatches; } - + + public final double getEstimatedQReported() { + return estimatedQReported; + } + + private double calcExpectedErrors() { + return (double)this.numObservations * qualToErrorProb( estimatedQReported ); + } + + private double qualToErrorProb( final double qual ) { + return Math.pow(10.0, qual / -10.0); + } + public String toString() { return String.format( "RecalDatum: %d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() ); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 420d5664b..8702cb85f 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -93,7 +93,7 @@ public class TableRecalibrationWalker extends ReadWalker e = new HashMap(); e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "d7f3e0db5ed9fefc917144a0f937d50d" ); e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "94f2c602fef9800e270326be9faab3ad"); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "eea2e02ffe71c9a1f318d2e9cd31a103" ); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "8126b9494837d504ef1277a799267c15" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "320ff4cbf8f1e73ba2d6e93567382684" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "e9a7ac8bc7c6b3daaf7f80730046318d" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey();