Eliminated unnecessary boxing of longs in RecalDatum. Changes to RecalDatum in preparation for new AnalyzeCovariates script. Updated TableRecalibrationWalker to make use of these changes.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2199 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-12-01 16:49:05 +00:00
parent c93d37d9fb
commit 3180fffd43
4 changed files with 39 additions and 39 deletions

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.*;
@ -47,12 +48,10 @@ public class RecalDataManager {
private NHashMap<RecalDatum> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
private NHashMap<RecalDatum> dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
private NHashMap<Double> dataSumExpectedErrors; // Table used to calculate the overall aggregate reported quality score in which everything except read group is collapsed
private NHashMap<Double> dataCollapsedReadGroupDouble; // Table of empirical qualities where everything except read group has been collapsed
private NHashMap<Double> dataCollapsedQualityScoreDouble; // Table of empirical qualities where everything except read group and quality score has been collapsed
private ArrayList<NHashMap<Double>> dataCollapsedByCovariateDouble; // Table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
public NHashMap<Double> aggregateReportedQuality; // Table of the overall aggregate reported quality score in which everything except read group is collapsed
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
@ -70,8 +69,6 @@ public class RecalDataManager {
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() );
}
dataSumExpectedErrors = new NHashMap<Double>();
aggregateReportedQuality = new NHashMap<Double>();
} else {
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f);
}
@ -100,19 +97,7 @@ public class RecalDataManager {
if( collapsedDatum == null ) {
dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) );
} else {
collapsedDatum.increment(fullDatum);
}
// Create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // Make a new key with just the read group
Double sumExpectedErrors = dataSumExpectedErrors.get( newKey );
if( sumExpectedErrors == null ) {
dataSumExpectedErrors.put( newKey, 0.0 );
} else {
dataSumExpectedErrors.remove( newKey );
sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * fullDatum.getNumObservations();
dataSumExpectedErrors.put( newKey, sumExpectedErrors );
collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported
}
newKey = new ArrayList<Comparable>();
@ -123,7 +108,7 @@ public class RecalDataManager {
if( collapsedDatum == null ) {
dataCollapsedQualityScore.put( newKey, new RecalDatum(fullDatum) );
} else {
collapsedDatum.increment(fullDatum);
collapsedDatum.increment( fullDatum );
}
// Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
@ -136,7 +121,7 @@ public class RecalDataManager {
if( collapsedDatum == null ) {
dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) );
} else {
collapsedDatum.increment(fullDatum);
collapsedDatum.increment( fullDatum );
}
}
}
@ -160,8 +145,6 @@ public class RecalDataManager {
// Looping over the entrySet is really expensive but worth it
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) {
dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) );
aggregateReportedQuality.put( entry.getKey(), // sum of the expected errors divided by number of observations turned into a Q score
QualityUtils.phredScaleErrorRate( dataSumExpectedErrors.get(entry.getKey()) / ((double)entry.getValue().getNumObservations()) ) );
}
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) {
dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) );
@ -172,15 +155,11 @@ public class RecalDataManager {
}
}
dataSumExpectedErrors.clear();
dataCollapsedReadGroup.clear();
dataCollapsedQualityScore.clear();
for( int iii = 0; iii < numCovariates - 2; iii++ ) {
dataCollapsedByCovariate.get(iii).clear();
}
dataCollapsedByCovariate.clear();
dataSumExpectedErrors = null; // Will never need this table again
dataCollapsedReadGroup = null; // Will never need this table again
dataCollapsedQualityScore = null; // Will never need this table again
dataCollapsedByCovariate = null; // Will never need this table again
if( data != null ) {
@ -211,7 +190,7 @@ public class RecalDataManager {
*/
public final NHashMap<Double> getCollapsedDoubleTable( final int covariate ) {
if( covariate == 0) {
return dataCollapsedReadGroupDouble; // Table of empirical qualities where everything except read group has been collapsed
return dataCollapsedReadGroupDouble;
} else if( covariate == 1 ) {
return dataCollapsedQualityScoreDouble; // Table of empirical qualities where everything except read group and quality score has been collapsed
} else {

View File

@ -42,7 +42,7 @@ public class RecalDatum {
private long numObservations; // number of bases seen in total
private long numMismatches; // number of bases seen that didn't match the reference
private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
//---------------------------------------------------------------------------------------------------------------
//
@ -52,16 +52,19 @@ public class RecalDatum {
public RecalDatum() {
numObservations = 0L;
numMismatches = 0L;
estimatedQReported = 0.0;
}
public RecalDatum( final long _numObservations, final long _numMismatches ) {
public RecalDatum( final long _numObservations, final long _numMismatches, final double _estimatedQReported ) {
numObservations = _numObservations;
numMismatches = _numMismatches;
estimatedQReported = _estimatedQReported;
}
public RecalDatum( final RecalDatum copy ) {
this.numObservations = copy.numObservations;
this.numMismatches = copy.numMismatches;
this.estimatedQReported = copy.estimatedQReported;
}
//---------------------------------------------------------------------------------------------------------------
@ -80,6 +83,13 @@ public class RecalDatum {
increment( other.numObservations, other.numMismatches );
}
public final void combine( final RecalDatum other ) {
double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
this.increment( other.numObservations, other.numMismatches );
this.estimatedQReported = -10 * Math.log10(sumErrors / (double)this.numObservations);
if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; }
}
public final void increment( final List<RecalDatum> data ) {
for ( RecalDatum other : data ) {
this.increment( other );
@ -126,14 +136,26 @@ public class RecalDatum {
return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte( smoothing ) );
}
public final Long getNumObservations() {
public final long getNumObservations() {
return numObservations;
}
public final Long getNumMismatches() {
public final long getNumMismatches() {
return numMismatches;
}
public final double getEstimatedQReported() {
return estimatedQReported;
}
private double calcExpectedErrors() {
return (double)this.numObservations * qualToErrorProb( estimatedQReported );
}
private double qualToErrorProb( final double qual ) {
return Math.pow(10.0, qual / -10.0);
}
public String toString() {
return String.format( "RecalDatum: %d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() );
}

View File

@ -93,7 +93,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
private static final String versionString = "v2.0.8"; // Major version, minor version, and build number
private static final String versionString = "v2.0.9"; // Major version, minor version, and build number
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
//---------------------------------------------------------------------------------------------------------------
@ -278,7 +278,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
}
// Create a new datum using the number of observations and number of mismatches
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ) );
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ) );
// Add that datum to all the collapsed tables which will be used in the sequential calculation
dataManager.addToAllTables( key, datum );
@ -366,11 +366,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// The global quality shift (over the read group only)
collapsedTableKey.add( readGroupKeyElement );
Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get( collapsedTableKey );
Double aggregrateQreported = dataManager.aggregateReportedQuality.get( collapsedTableKey );
double globalDeltaQ = 0.0;
if( globalDeltaQEmpirical != null && aggregrateQreported != null ) {
globalDeltaQ = globalDeltaQEmpirical - aggregrateQreported;
if( globalDeltaQEmpirical != null ) {
Double aggregrateQReported = dataManager.getCollapsedTable(0).get( collapsedTableKey ).getEstimatedQReported();
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
}
// The shift in quality between reported and empirical

View File

@ -50,8 +50,8 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "d7f3e0db5ed9fefc917144a0f937d50d" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "94f2c602fef9800e270326be9faab3ad");
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "eea2e02ffe71c9a1f318d2e9cd31a103" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "8126b9494837d504ef1277a799267c15" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "320ff4cbf8f1e73ba2d6e93567382684" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "e9a7ac8bc7c6b3daaf7f80730046318d" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();