diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index c8d39da7f..3449baafd 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -103,7 +103,7 @@ public class CovariateCounterWalker extends LocusWalker { private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted. private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site - private static final String versionString = "v2.2.1"; // Major version, minor version, and build number + private static final String versionString = "v2.2.2"; // Major version, minor version, and build number private Pair dbSNP_counts = new Pair(0L, 0L); // mismatch/base counts for dbSNP loci private Pair novel_counts = new Pair(0L, 0L); // mismatch/base counts for non-dbSNP loci private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) @@ -157,7 +157,6 @@ public class CovariateCounterWalker extends LocusWalker { Utils.warnUser("This calculation is critically dependent on being able to skip over known variant sites. Are you sure you want to be running without a dbSNP rod specified?"); } - // Initialize the requested covariates by parsing the -cov argument requestedCovariates = new ArrayList(); // First add the required covariates @@ -324,7 +323,6 @@ public class CovariateCounterWalker extends LocusWalker { } counts.second++; } - } } @@ -368,9 +366,9 @@ public class CovariateCounterWalker extends LocusWalker { } // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap - RecalDatum datum = (RecalDatum) dataManager.data.get( key ); + RecalDatumOptimized datum = (RecalDatumOptimized) dataManager.data.get( key ); if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it - datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method + datum = new RecalDatumOptimized(); // initialized with zeros, will be incremented at end of method dataManager.data.put( datum, (Object[])key ); } @@ -469,14 +467,14 @@ public class CovariateCounterWalker extends LocusWalker { for( Comparable comp : keyList ) { key[curPos] = comp; final Object val = data.get(comp); - if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps + if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps // For each Covariate in the key for( Object compToPrint : key ) { // Output the Covariate's value recalTableStream.print( compToPrint + "," ); } // Output the RecalDatum entry - recalTableStream.println( ((RecalDatum)val).outputToCSV() ); + recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() ); } else { // Another layer in the nested hash map printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val); } @@ -487,14 +485,14 @@ public class CovariateCounterWalker extends LocusWalker { for( Object comp : data.keySet() ) { key[curPos] = comp; final Object val = data.get(comp); - if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps + if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps // For each Covariate in the key for( Object compToPrint : key ) { // Output the Covariate's value recalTableStream.print( compToPrint + "," ); } // Output the RecalDatum entry - recalTableStream.println( ((RecalDatum)val).outputToCSV() ); + recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() ); } else { // Another layer in the nested hash map printMappings( recalTableStream, curPos + 1, key, (Map) val); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java new file mode 100755 index 000000000..c12ca0339 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDatumOptimized.java @@ -0,0 +1,139 @@ +package org.broadinstitute.sting.gatk.walkers.recalibration; + +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.*; + +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Jan 6, 2010 + * + * An individual piece of recalibration data. Optimized for CountCovariates. Extras added to make TableRecalibration fast have been removed. + * Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates. + */ + +public class RecalDatumOptimized { + + private long numObservations; // number of bases seen in total + private long numMismatches; // number of bases seen that didn't match the reference + + //--------------------------------------------------------------------------------------------------------------- + // + // constructors + // + //--------------------------------------------------------------------------------------------------------------- + public RecalDatumOptimized() { + numObservations = 0L; + numMismatches = 0L; + } + + public RecalDatumOptimized( final long _numObservations, final long _numMismatches) { + numObservations = _numObservations; + numMismatches = _numMismatches; + } + + public RecalDatumOptimized( final RecalDatumOptimized copy ) { + this.numObservations = copy.numObservations; + this.numMismatches = copy.numMismatches; + } + + //--------------------------------------------------------------------------------------------------------------- + // + // increment methods + // + //--------------------------------------------------------------------------------------------------------------- + + + public final void increment( final long incObservations, final long incMismatches ) { + numObservations += incObservations; + numMismatches += incMismatches; + } + + public final void increment( final RecalDatumOptimized other ) { + increment( other.numObservations, other.numMismatches ); + } + + public final void increment( final List data ) { + for ( RecalDatumOptimized other : data ) { + this.increment( other ); + } + } + + public final void increment( final char curBase, final char refBase ) { + increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(refBase) ? 0 : 1 ); // increment takes num observations, then num mismatches + } + + //--------------------------------------------------------------------------------------------------------------- + // + // methods to derive empirical quality score + // + //--------------------------------------------------------------------------------------------------------------- + + public final double empiricalQualDouble( final int smoothing ) { + final double doubleMismatches = (double) ( numMismatches + smoothing ); + final double doubleObservations = (double) ( numObservations + smoothing ); + double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); + if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) { empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; } + return empiricalQual; + } + public final double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero + + public final byte empiricalQualByte( final int smoothing ) { + final double doubleMismatches = (double) ( numMismatches + smoothing ); + final double doubleObservations = (double) ( numObservations + smoothing ); + return QualityUtils.probToQual( 1.0 - doubleMismatches / doubleObservations ); + } + public final byte empiricalQualByte() { return empiricalQualByte( 0 ); } // 'default' behavior is to use smoothing value of zero + + //--------------------------------------------------------------------------------------------------------------- + // + // misc. methods + // + //--------------------------------------------------------------------------------------------------------------- + + public final String outputToCSV( ) { + return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() ); + } + public final String outputToCSV( final int smoothing ) { + return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte( smoothing ) ); + } + + public final long getNumObservations() { + return numObservations; + } + + public final long getNumMismatches() { + return numMismatches; + } + + public String toString() { + return String.format( "RecalDatum: %d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() ); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index ecac772d9..02fbc1eb3 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -92,7 +92,7 @@ public class TableRecalibrationWalker extends ReadWalker