Eliminated unnecessary boxing of longs in RecalDatum. Changes to RecalDatum in preparation for new AnalyzeCovariates script. Updated TableRecalibrationWalker to make use of these changes.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2199 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c93d37d9fb
commit
3180fffd43
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
|
|||
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -47,12 +48,10 @@ public class RecalDataManager {
|
|||
private NHashMap<RecalDatum> dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
private NHashMap<RecalDatum> dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
|
||||
private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
private NHashMap<Double> dataSumExpectedErrors; // Table used to calculate the overall aggregate reported quality score in which everything except read group is collapsed
|
||||
|
||||
private NHashMap<Double> dataCollapsedReadGroupDouble; // Table of empirical qualities where everything except read group has been collapsed
|
||||
private NHashMap<Double> dataCollapsedQualityScoreDouble; // Table of empirical qualities where everything except read group and quality score has been collapsed
|
||||
private ArrayList<NHashMap<Double>> dataCollapsedByCovariateDouble; // Table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed
|
||||
public NHashMap<Double> aggregateReportedQuality; // Table of the overall aggregate reported quality score in which everything except read group is collapsed
|
||||
|
||||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
|
||||
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
|
||||
|
|
@ -70,8 +69,6 @@ public class RecalDataManager {
|
|||
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
|
||||
dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() );
|
||||
}
|
||||
dataSumExpectedErrors = new NHashMap<Double>();
|
||||
aggregateReportedQuality = new NHashMap<Double>();
|
||||
} else {
|
||||
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.8f);
|
||||
}
|
||||
|
|
@ -100,19 +97,7 @@ public class RecalDataManager {
|
|||
if( collapsedDatum == null ) {
|
||||
dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) );
|
||||
} else {
|
||||
collapsedDatum.increment(fullDatum);
|
||||
}
|
||||
|
||||
// Create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed
|
||||
newKey = new ArrayList<Comparable>();
|
||||
newKey.add( key.get(0) ); // Make a new key with just the read group
|
||||
Double sumExpectedErrors = dataSumExpectedErrors.get( newKey );
|
||||
if( sumExpectedErrors == null ) {
|
||||
dataSumExpectedErrors.put( newKey, 0.0 );
|
||||
} else {
|
||||
dataSumExpectedErrors.remove( newKey );
|
||||
sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * fullDatum.getNumObservations();
|
||||
dataSumExpectedErrors.put( newKey, sumExpectedErrors );
|
||||
collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported
|
||||
}
|
||||
|
||||
newKey = new ArrayList<Comparable>();
|
||||
|
|
@ -123,7 +108,7 @@ public class RecalDataManager {
|
|||
if( collapsedDatum == null ) {
|
||||
dataCollapsedQualityScore.put( newKey, new RecalDatum(fullDatum) );
|
||||
} else {
|
||||
collapsedDatum.increment(fullDatum);
|
||||
collapsedDatum.increment( fullDatum );
|
||||
}
|
||||
|
||||
// Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
|
|
@ -136,7 +121,7 @@ public class RecalDataManager {
|
|||
if( collapsedDatum == null ) {
|
||||
dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) );
|
||||
} else {
|
||||
collapsedDatum.increment(fullDatum);
|
||||
collapsedDatum.increment( fullDatum );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -160,8 +145,6 @@ public class RecalDataManager {
|
|||
// Looping over the entrySet is really expensive but worth it
|
||||
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) {
|
||||
dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) );
|
||||
aggregateReportedQuality.put( entry.getKey(), // sum of the expected errors divided by number of observations turned into a Q score
|
||||
QualityUtils.phredScaleErrorRate( dataSumExpectedErrors.get(entry.getKey()) / ((double)entry.getValue().getNumObservations()) ) );
|
||||
}
|
||||
for( Map.Entry<List<? extends Comparable>,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) {
|
||||
dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) );
|
||||
|
|
@ -172,15 +155,11 @@ public class RecalDataManager {
|
|||
}
|
||||
}
|
||||
|
||||
dataSumExpectedErrors.clear();
|
||||
dataCollapsedReadGroup.clear();
|
||||
dataCollapsedQualityScore.clear();
|
||||
for( int iii = 0; iii < numCovariates - 2; iii++ ) {
|
||||
dataCollapsedByCovariate.get(iii).clear();
|
||||
}
|
||||
dataCollapsedByCovariate.clear();
|
||||
dataSumExpectedErrors = null; // Will never need this table again
|
||||
dataCollapsedReadGroup = null; // Will never need this table again
|
||||
dataCollapsedQualityScore = null; // Will never need this table again
|
||||
dataCollapsedByCovariate = null; // Will never need this table again
|
||||
if( data != null ) {
|
||||
|
|
@ -211,7 +190,7 @@ public class RecalDataManager {
|
|||
*/
|
||||
public final NHashMap<Double> getCollapsedDoubleTable( final int covariate ) {
|
||||
if( covariate == 0) {
|
||||
return dataCollapsedReadGroupDouble; // Table of empirical qualities where everything except read group has been collapsed
|
||||
return dataCollapsedReadGroupDouble;
|
||||
} else if( covariate == 1 ) {
|
||||
return dataCollapsedQualityScoreDouble; // Table of empirical qualities where everything except read group and quality score has been collapsed
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ public class RecalDatum {
|
|||
|
||||
private long numObservations; // number of bases seen in total
|
||||
private long numMismatches; // number of bases seen that didn't match the reference
|
||||
|
||||
private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -52,16 +52,19 @@ public class RecalDatum {
|
|||
public RecalDatum() {
|
||||
numObservations = 0L;
|
||||
numMismatches = 0L;
|
||||
estimatedQReported = 0.0;
|
||||
}
|
||||
|
||||
public RecalDatum( final long _numObservations, final long _numMismatches ) {
|
||||
public RecalDatum( final long _numObservations, final long _numMismatches, final double _estimatedQReported ) {
|
||||
numObservations = _numObservations;
|
||||
numMismatches = _numMismatches;
|
||||
estimatedQReported = _estimatedQReported;
|
||||
}
|
||||
|
||||
public RecalDatum( final RecalDatum copy ) {
|
||||
this.numObservations = copy.numObservations;
|
||||
this.numMismatches = copy.numMismatches;
|
||||
this.estimatedQReported = copy.estimatedQReported;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -80,6 +83,13 @@ public class RecalDatum {
|
|||
increment( other.numObservations, other.numMismatches );
|
||||
}
|
||||
|
||||
public final void combine( final RecalDatum other ) {
|
||||
double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
|
||||
this.increment( other.numObservations, other.numMismatches );
|
||||
this.estimatedQReported = -10 * Math.log10(sumErrors / (double)this.numObservations);
|
||||
if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; }
|
||||
}
|
||||
|
||||
public final void increment( final List<RecalDatum> data ) {
|
||||
for ( RecalDatum other : data ) {
|
||||
this.increment( other );
|
||||
|
|
@ -126,14 +136,26 @@ public class RecalDatum {
|
|||
return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte( smoothing ) );
|
||||
}
|
||||
|
||||
public final Long getNumObservations() {
|
||||
public final long getNumObservations() {
|
||||
return numObservations;
|
||||
}
|
||||
|
||||
public final Long getNumMismatches() {
|
||||
public final long getNumMismatches() {
|
||||
return numMismatches;
|
||||
}
|
||||
|
||||
|
||||
public final double getEstimatedQReported() {
|
||||
return estimatedQReported;
|
||||
}
|
||||
|
||||
private double calcExpectedErrors() {
|
||||
return (double)this.numObservations * qualToErrorProb( estimatedQReported );
|
||||
}
|
||||
|
||||
private double qualToErrorProb( final double qual ) {
|
||||
return Math.pow(10.0, qual / -10.0);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format( "RecalDatum: %d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() );
|
||||
}
|
||||
|
|
|
|||
|
|
@ -93,7 +93,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
private static final String versionString = "v2.0.8"; // Major version, minor version, and build number
|
||||
private static final String versionString = "v2.0.9"; // Major version, minor version, and build number
|
||||
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -278,7 +278,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
}
|
||||
}
|
||||
// Create a new datum using the number of observations and number of mismatches
|
||||
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ) );
|
||||
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ) );
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum );
|
||||
|
||||
|
|
@ -366,11 +366,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
// The global quality shift (over the read group only)
|
||||
collapsedTableKey.add( readGroupKeyElement );
|
||||
Double globalDeltaQEmpirical = dataManager.getCollapsedDoubleTable(0).get( collapsedTableKey );
|
||||
Double aggregrateQreported = dataManager.aggregateReportedQuality.get( collapsedTableKey );
|
||||
double globalDeltaQ = 0.0;
|
||||
if( globalDeltaQEmpirical != null && aggregrateQreported != null ) {
|
||||
|
||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQreported;
|
||||
if( globalDeltaQEmpirical != null ) {
|
||||
Double aggregrateQReported = dataManager.getCollapsedTable(0).get( collapsedTableKey ).getEstimatedQReported();
|
||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
|
||||
}
|
||||
|
||||
// The shift in quality between reported and empirical
|
||||
|
|
|
|||
|
|
@ -50,8 +50,8 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "d7f3e0db5ed9fefc917144a0f937d50d" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "94f2c602fef9800e270326be9faab3ad");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "eea2e02ffe71c9a1f318d2e9cd31a103" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "8126b9494837d504ef1277a799267c15" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "320ff4cbf8f1e73ba2d6e93567382684" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "e9a7ac8bc7c6b3daaf7f80730046318d" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
|
|||
Loading…
Reference in New Issue