diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index f7cc093b3..4332a6def 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -110,7 +110,7 @@ public class CovariateCounterWalker extends LocusWalker { private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site - private final String versionNumber = "2.0.3"; // Major version, minor version, and build number + private final String versionNumber = "2.0.4"; // Major version, minor version, and build number private Pair dbSNP_counts = new Pair(0L, 0L); // mismatch/base counts for dbSNP loci private Pair novel_counts = new Pair(0L, 0L); // mismatch/base counts for non-dbSNP loci private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) @@ -320,7 +320,7 @@ public class CovariateCounterWalker extends LocusWalker { Utils.warnUser("The input .bam file contains reads with no read group. Defaulting to Illumina platform and empty read group name."); warnUserNullReadGroup = true; } - readGroupId = ReadGroupCovariate.collapsedReadGroup; + readGroupId = ReadGroupCovariate.collapsedReadGroupCode; platform = "ILLUMINA"; } else { readGroupId = readGroup.getReadGroupId(); @@ -332,7 +332,7 @@ public class CovariateCounterWalker extends LocusWalker { platform = "ILLUMINA"; } if( IGNORE_READGROUP ) { - readGroupId = ReadGroupCovariate.collapsedReadGroup; + readGroupId = ReadGroupCovariate.collapsedReadGroupCode; } readDatum = new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length ); @@ -348,11 +348,11 @@ public class CovariateCounterWalker extends LocusWalker { if( readDatum.quals[offset] > 0 ) { refBase = (byte)ref.getBase(); - prevBase = readDatum.bases[offset-1]; + prevBase = readDatum.bases[offset - 1]; // Get the complement base strand if we are a negative strand read, DinucCovariate is responsible for getting the complement bases if needed if( readDatum.isNegStrand ) { - prevBase = readDatum.bases[offset+1]; + prevBase = readDatum.bases[offset + 1]; } // Skip if this base or the previous one was an 'N' or etc. diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java index aaef9f960..55f0643c5 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java @@ -35,7 +35,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; public class ReadGroupCovariate implements Covariate{ - public static final String collapsedReadGroup = "Read Group Collapsed"; + public static final String collapsedReadGroupCode = "Read Group Collapsed"; public ReadGroupCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index 5ca812ff9..57dbf902b 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -38,26 +38,26 @@ import java.util.*; */ public class RecalDataManager { - public NHashMap data; // the full dataset - private NHashMap dataCollapsedReadGroup; // table where everything except read group has been collapsed - private NHashMap dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed - private ArrayList> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed - public NHashMap dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed + public NHashMap data; // The full dataset + private NHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed + private NHashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed + private ArrayList> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed + private NHashMap dataSumExpectedErrors; // Table used to calculate the overall aggregate reported quality score in which everything except read group is collapsed - private NHashMap dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed - private NHashMap dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed - private ArrayList> dataCollapsedByCovariateDouble; // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed + private NHashMap dataCollapsedReadGroupDouble; // Table of empirical qualities where everything except read group has been collapsed + private NHashMap dataCollapsedQualityScoreDouble; // Table of empirical qualities where everything except read group and quality score has been collapsed + private ArrayList> dataCollapsedByCovariateDouble; // Table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed + public NHashMap aggregateReportedQuality; // Table of the overall aggregate reported quality score in which everything except read group is collapsed - - public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores - public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams + public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores + public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams RecalDataManager() { data = new NHashMap(); } RecalDataManager( final int estimatedCapacity, final boolean createCollapsedTables, final int numCovariates ) { - if( createCollapsedTables ) { // initialize all the collapsed tables + if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker dataCollapsedReadGroup = new NHashMap(); dataCollapsedQualityScore = new NHashMap(); dataCollapsedByCovariate = new ArrayList>(); @@ -65,13 +65,14 @@ public class RecalDataManager { dataCollapsedByCovariate.add( new NHashMap() ); } dataSumExpectedErrors = new NHashMap(); + aggregateReportedQuality = new NHashMap(); } else { data = new NHashMap( estimatedCapacity, 0.8f); } } RecalDataManager( final int estimatedCapacity ) { - data = new NHashMap( estimatedCapacity, 0.8f ); // second arg is the 'loading factor', + data = new NHashMap( estimatedCapacity, 0.8f ); // Second arg is the 'loading factor', // a number to monkey around with when optimizing performace of the HashMap } @@ -85,9 +86,9 @@ public class RecalDataManager { // The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around //data.put(key, thisDatum); // add the mapping to the main table - // create dataCollapsedReadGroup, the table where everything except read group has been collapsed + // Create dataCollapsedReadGroup, the table where everything except read group has been collapsed ArrayList newKey = new ArrayList(); - newKey.add( key.get(0) ); // make a new key with just the read group + newKey.add( key.get(0) ); // Make a new key with just the read group RecalDatum collapsedDatum = dataCollapsedReadGroup.get( newKey ); if( collapsedDatum == null ) { dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) ); @@ -95,9 +96,9 @@ public class RecalDataManager { collapsedDatum.increment(fullDatum); } - // create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed + // Create dataSumExpectedErrors, the table used to calculate the overall aggregate quality score in which everything except read group is collapsed newKey = new ArrayList(); - newKey.add( key.get(0) ); // make a new key with just the read group + newKey.add( key.get(0) ); // Make a new key with just the read group Double sumExpectedErrors = dataSumExpectedErrors.get( newKey ); if( sumExpectedErrors == null ) { dataSumExpectedErrors.put( newKey, 0.0 ); @@ -108,8 +109,8 @@ public class RecalDataManager { } newKey = new ArrayList(); - // create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed - newKey.add( key.get(0) ); // make a new key with the read group ... + // Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed + newKey.add( key.get(0) ); // Make a new key with the read group ... newKey.add( key.get(1) ); // and quality score collapsedDatum = dataCollapsedQualityScore.get( newKey ); if( collapsedDatum == null ) { @@ -118,10 +119,10 @@ public class RecalDataManager { collapsedDatum.increment(fullDatum); } - // create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed - for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { // readGroup and QualityScore aren't counted + // Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed + for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { newKey = new ArrayList(); - newKey.add( key.get(0) ); // make a new key with the read group ... + newKey.add( key.get(0) ); // Make a new key with the read group ... newKey.add( key.get(1) ); // and quality score ... newKey.add( key.get(iii + 2) ); // and the given covariate collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey ); @@ -151,24 +152,30 @@ public class RecalDataManager { // Hash the empirical quality scores so we don't have to call Math.log at every base for every read // Looping over the entrySet is really expensive but worth it for( Map.Entry,RecalDatum> entry : dataCollapsedReadGroup.entrySet() ) { - dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); + dataCollapsedReadGroupDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) ); + aggregateReportedQuality.put( entry.getKey(), // sum of the expected errors divided by number of observations turned into a Q score + QualityUtils.phredScaleErrorRate( dataSumExpectedErrors.get(entry.getKey()) / ((double)entry.getValue().getNumObservations()) ) ); } for( Map.Entry,RecalDatum> entry : dataCollapsedQualityScore.entrySet() ) { - dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); + dataCollapsedQualityScoreDouble.put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) ); } for( int iii = 0; iii < numCovariates - 2; iii++ ) { for( Map.Entry,RecalDatum> entry : dataCollapsedByCovariate.get(iii).entrySet() ) { - dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing )); + dataCollapsedByCovariateDouble.get(iii).put( entry.getKey(), entry.getValue().empiricalQualDouble( smoothing ) ); } } + dataSumExpectedErrors.clear(); + dataCollapsedReadGroup.clear(); dataCollapsedQualityScore.clear(); dataCollapsedByCovariate.clear(); - dataCollapsedQualityScore = null; // will never need this again - dataCollapsedByCovariate = null; // will never need this again - if( data!=null ) { + dataSumExpectedErrors = null; // Will never need this table again + dataCollapsedReadGroup = null; // Will never need this table again + dataCollapsedQualityScore = null; // Will never need this table again + dataCollapsedByCovariate = null; // Will never need this table again + if( data != null ) { data.clear(); - data = null; // will never need this again + data = null; // Will never need this table again } } @@ -179,11 +186,11 @@ public class RecalDataManager { */ public final NHashMap getCollapsedTable( final int covariate ) { if( covariate == 0) { - return dataCollapsedReadGroup; // table where everything except read group has been collapsed + return dataCollapsedReadGroup; // Table where everything except read group has been collapsed } else if( covariate == 1 ) { - return dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed + return dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed } else { - return dataCollapsedByCovariate.get( covariate - 2 ); // table where everything except read group, quality score, and given covariate has been collapsed + return dataCollapsedByCovariate.get( covariate - 2 ); // Table where everything except read group, quality score, and given covariate has been collapsed } } @@ -194,11 +201,11 @@ public class RecalDataManager { */ public final NHashMap getCollapsedDoubleTable( final int covariate ) { if( covariate == 0) { - return dataCollapsedReadGroupDouble; // table of empirical qualities where everything except read group has been collapsed + return dataCollapsedReadGroupDouble; // Table of empirical qualities where everything except read group has been collapsed } else if( covariate == 1 ) { - return dataCollapsedQualityScoreDouble; // table of empirical qualities where everything except read group and quality score has been collapsed + return dataCollapsedQualityScoreDouble; // Table of empirical qualities where everything except read group and quality score has been collapsed } else { - return dataCollapsedByCovariateDouble.get( covariate - 2 ); // table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed + return dataCollapsedByCovariateDouble.get( covariate - 2 ); // Table of empirical qualities where everything except read group, quality score, and given covariate has been collapsed } } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 71c0f1c5e..9cd60c2a1 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -1,9 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMProgramRecord; +import net.sf.samtools.*; +import net.sf.picard.reference.ReferenceSequenceFileFactory; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.WalkerName; import org.broadinstitute.sting.gatk.walkers.Requires; @@ -103,7 +101,7 @@ public class TableRecalibrationWalker extends ReadWalker