From 277e6d6b327f99ae093a505a9403bf00640f5baf Mon Sep 17 00:00:00 2001 From: rpoplin Date: Tue, 24 Nov 2009 18:21:57 +0000 Subject: [PATCH] Further optimizations of TableRecalibration. This completes my goal of having the only math done in the map function be addition, subtraction and rounding the quality score to an integer. Everything else has been moved to the initialize method and only done once. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2145 348d0f76-0448-11de-a6fe-93d51630548a --- .../recalibration/CovariateCounterWalker.java | 10 +-- .../recalibration/ReadGroupCovariate.java | 2 +- .../recalibration/RecalDataManager.java | 77 ++++++++++--------- .../TableRecalibrationWalker.java | 37 +++++---- 4 files changed, 71 insertions(+), 55 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index f7cc093b3..4332a6def 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -110,7 +110,7 @@ public class CovariateCounterWalker extends LocusWalker { private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site - private final String versionNumber = "2.0.3"; // Major version, minor version, and build number + private final String versionNumber = "2.0.4"; // Major version, minor version, and build number private Pair dbSNP_counts = new Pair(0L, 0L); // mismatch/base counts for dbSNP loci private Pair novel_counts = new Pair(0L, 0L); // mismatch/base counts for non-dbSNP loci private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) @@ -320,7 +320,7 @@ public class CovariateCounterWalker extends LocusWalker { Utils.warnUser("The input .bam file contains reads with no read group. Defaulting to Illumina platform and empty read group name."); warnUserNullReadGroup = true; } - readGroupId = ReadGroupCovariate.collapsedReadGroup; + readGroupId = ReadGroupCovariate.collapsedReadGroupCode; platform = "ILLUMINA"; } else { readGroupId = readGroup.getReadGroupId(); @@ -332,7 +332,7 @@ public class CovariateCounterWalker extends LocusWalker { platform = "ILLUMINA"; } if( IGNORE_READGROUP ) { - readGroupId = ReadGroupCovariate.collapsedReadGroup; + readGroupId = ReadGroupCovariate.collapsedReadGroupCode; } readDatum = new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length ); @@ -348,11 +348,11 @@ public class CovariateCounterWalker extends LocusWalker { if( readDatum.quals[offset] > 0 ) { refBase = (byte)ref.getBase(); - prevBase = readDatum.bases[offset-1]; + prevBase = readDatum.bases[offset - 1]; // Get the complement base strand if we are a negative strand read, DinucCovariate is responsible for getting the complement bases if needed if( readDatum.isNegStrand ) { - prevBase = readDatum.bases[offset+1]; + prevBase = readDatum.bases[offset + 1]; } // Skip if this base or the previous one was an 'N' or etc. diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java index aaef9f960..55f0643c5 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java @@ -35,7 +35,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; public class ReadGroupCovariate implements Covariate{ - public static final String collapsedReadGroup = "Read Group Collapsed"; + public static final String collapsedReadGroupCode = "Read Group Collapsed"; public ReadGroupCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index 5ca812ff9..57dbf902b 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -38,26 +38,26 @@ import java.util.*; */ public class RecalDataManager { - public NHashMap data; // the full dataset - private NHashMap dataCollapsedReadGroup; // table where everything except read group has been collapsed - private NHashMap dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed - private ArrayList> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed - public NHashMap dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed + public NHashMap data; // The full dataset + private NHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed + private NHashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed + private ArrayList> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed + private NHashMap dataSumExpectedErrors; // Table used to calculate the overall aggregate reported quality score in which everything except read group is collapsed - private NHashMap dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed - private NHashMap dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed - private ArrayList> dataCollapsedByCovariateDouble; // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed + private NHashMap dataCollapsedReadGroupDouble; // Table of empirical qualities where everything except read group has been collapsed + private NHashMap dataCollapsedQualityScoreDouble; // Table of empirical qualities where everything except read group and quality score has been collapsed + private ArrayList> dataCollapsedByCovariateDouble; // Table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed + public NHashMap aggregateReportedQuality; // Table of the overall aggregate reported quality score in which everything except read group is collapsed - - public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores - public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams + public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores + public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams RecalDataManager() { data = new NHashMap(); } RecalDataManager( final int estimatedCapacity, final boolean createCollapsedTables, final int numCovariates ) { - if( createCollapsedTables ) { // initialize all the collapsed tables + if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker dataCollapsedReadGroup = new NHashMap(); dataCollapsedQualityScore = new NHashMap(); dataCollapsedByCovariate = new ArrayList>(); @@ -65,13 +65,14 @@ public class RecalDataManager { dataCollapsedByCovariate.add( new NHashMap() ); } dataSumExpectedErrors = new NHashMap(); + aggregateReportedQuality = new NHashMap(); } else { data = new NHashMap( estimatedCapacity, 0.8f); } } RecalDataManager( final int estimatedCapacity ) { - data = new NHashMap( estimatedCapacity, 0.8f ); // second arg is the 'loading factor', + data = new NHashMap( estimatedCapacity, 0.8f ); // Second arg is the 'loading factor', // a number to monkey around with when optimizing performace of the HashMap } @@ -85,9 +86,9 @@ public class RecalDataManager { // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around //data.put(key, thisDatum); // add the mapping to the main table - // create dataCollapsedReadGroup, the table where everything except read group has been collapsed + // Create dataCollapsedReadGroup, the table where everything except read group has been collapsed ArrayList newKey = new ArrayList(); - newKey.add( key.get(0) ); // make a new key with just the read group + newKey.add( key.get(0) ); // Make a new key with just the read group RecalDatum collapsedDatum = dataCollapsedReadGroup.get( newKey ); if( collapsedDatum == null ) { dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) ); @@ -95,9 +96,9 @@ public class RecalDataManager { collapsedDatum.increment(fullDatum); } - // create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed + // Create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed newKey = new ArrayList(); - newKey.add( key.get(0) ); // make a new key with just the read group + newKey.add( key.get(0) ); // Make a new key with just the read group Double sumExpectedErrors = dataSumExpectedErrors.get( newKey ); if( sumExpectedErrors == null ) { dataSumExpectedErrors.put( newKey, 0.0 ); @@ -108,8 +109,8 @@ public class RecalDataManager { } newKey = new ArrayList(); - // create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed - newKey.add( key.get(0) ); // make a new key with the read group ... + // Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed + newKey.add( key.get(0) ); // Make a new key with the read group ... newKey.add( key.get(1) ); // and quality score collapsedDatum = dataCollapsedQualityScore.get( newKey ); if( collapsedDatum == null ) { @@ -118,10 +119,10 @@ public class RecalDataManager { collapsedDatum.increment(fullDatum); } - // create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed - for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { // readGroup and QualityScore aren't counted + // Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed + for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { newKey = new ArrayList(); - newKey.add( key.get(0) ); // make a new key with the read group ... + newKey.add( key.get(0) ); // Make a new key with the read group ... newKey.add( key.get(1) ); // and quality score ... newKey.add( key.get(iii + 2) ); // and the given covariate collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey ); @@ -151,24 +152,30 @@ public class RecalDataManager { // Hash the empirical quality scores so we don't have to call Math.log at every base for every read // Looping over the entrySet is really expensive but worth it for( Map.Entry,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) { - dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); + dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) ); + aggregateReportedQuality.put( entry.getKey(), // sum of the expected errors divided by number of observations turned into a Q score + QualityUtils.phredScaleErrorRate( dataSumExpectedErrors.get(entry.getKey()) / ((double)entry.getValue().getNumObservations()) ) ); } for( Map.Entry,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) { - dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); + dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) ); } for( int iii = 0; iii < numCovariates - 2; iii++ ) { for( Map.Entry,RecalDatum> entry : dataCollapsedByCovariate.get(iii).entrySet() ) { - dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); + dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) ); } } + dataSumExpectedErrors.clear(); + dataCollapsedReadGroup.clear(); dataCollapsedQualityScore.clear(); dataCollapsedByCovariate.clear(); - dataCollapsedQualityScore = null; // will never need this again - dataCollapsedByCovariate = null; // will never need this again - if( data!=null ) { + dataSumExpectedErrors = null; // Will never need this table again + dataCollapsedReadGroup = null; // Will never need this table again + dataCollapsedQualityScore = null; // Will never need this table again + dataCollapsedByCovariate = null; // Will never need this table again + if( data != null ) { data.clear(); - data = null; // will never need this again + data = null; // Will never need this table again } } @@ -179,11 +186,11 @@ public class RecalDataManager { */ public final NHashMap getCollapsedTable( final int covariate ) { if( covariate == 0) { - return dataCollapsedReadGroup; // table where everything except read group has been collapsed + return dataCollapsedReadGroup; // Table where everything except read group has been collapsed } else if( covariate == 1 ) { - return dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed + return dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed } else { - return dataCollapsedByCovariate.get( covariate - 2 ); // table where everything except read group, quality score, and given covariate has been collapsed + return dataCollapsedByCovariate.get( covariate - 2 ); // Table where everything except read group, quality score, and given covariate has been collapsed } } @@ -194,11 +201,11 @@ public class RecalDataManager { */ public final NHashMap getCollapsedDoubleTable( final int covariate ) { if( covariate == 0) { - return dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed + return dataCollapsedReadGroupDouble; // Table of empirical qualities where everything except read group has been collapsed } else if( covariate == 1 ) { - return dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed + return dataCollapsedQualityScoreDouble; // Table of empirical qualities where everything except read group and quality score has been collapsed } else { - return dataCollapsedByCovariateDouble.get( covariate - 2 ); // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed + return dataCollapsedByCovariateDouble.get( covariate - 2 ); // Table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed } } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 71c0f1c5e..9cd60c2a1 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -1,9 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMProgramRecord; +import net.sf.samtools.*; +import net.sf.picard.reference.ReferenceSequenceFileFactory; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.WalkerName; import org.broadinstitute.sting.gatk.walkers.Requires; @@ -103,7 +101,7 @@ public class TableRecalibrationWalker extends ReadWalker