diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java index 0dccab54f..9f1b99046 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java @@ -36,6 +36,8 @@ import net.sf.samtools.SAMRecord; */ public interface Covariate { + public static final String COVARIATE_ERROR = "COVARIATE_ERROR"; + public static final String COVARIATE_NULL = "COVARITATE_NULL"; public Comparable getValue(SAMRecord read, int offset, char[] refBases); // used to pick out the value from the read, offset, and reference bases public Comparable getValue(String str); // used to get value from input file public int estimatedNumberOfBins(); // used to estimate the amount space required in the HashMap diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java index 404ce4959..a4c50e15d 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java @@ -56,7 +56,7 @@ import net.sf.samtools.SAMRecord; * The output file is a CSV list of (several covariate values, num observations, num mismatches, empirical quality score) * The first lines of the output file give the name of the covariate classes that were used for this calculation. * - * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and are automatically added first. + * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and must be at the start of the list. * Note: This walker is designed to be used in conjunction with TableRecalibrationWalker. */ @@ -83,6 +83,12 @@ public class CovariateCounterWalker extends LocusWalker { private long countedBases = 0; // used for reporting at the end private long skippedSites = 0; // used for reporting at the end + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- + /** * Parse the -cov arguments and create a list of covariates to be used here * Based on the covariates' estimates for initial capacity allocate the data hashmap @@ -167,10 +173,10 @@ public class CovariateCounterWalker extends LocusWalker { /** * For each read at this locus get the various covariate values and increment that location in the map based on * whether or not the base matches the reference at this particular location - * @param tracker the reference metadata tracker - * @param ref the reference context - * @param context the alignment context - * @return returns 1, but this value isn't used in the reduce step + * @param tracker The reference metadata tracker + * @param ref The reference context + * @param context The alignment context + * @return Returns 1, but this value isn't used in the reduce step */ public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { @@ -214,9 +220,9 @@ public class CovariateCounterWalker extends LocusWalker { * Loop through the list of requested covariates and pick out the value from the read, offset, and reference * Using the list of covariate values as a key, pick out the RecalDatum and increment, * adding one to the number of observations and potentially one to the number of mismatches - * @param read the read - * @param offset the offset in the read for this locus - * @param ref the reference context + * @param read The read + * @param offset The offset in the read for this locus + * @param ref The reference context */ private void updateDataFromRead(SAMRecord read, int offset, ReferenceContext ref) { @@ -227,7 +233,7 @@ public class CovariateCounterWalker extends LocusWalker { // Loop through the list of requested covariates and pick out the value from the read, offset, and reference for( Covariate covariate : requestedCovariates ) { keyElement = covariate.getValue( read, offset, ref.getBases() ); - if( keyElement != null ) { + if( keyElement != Covariate.COVARIATE_ERROR && keyElement != Covariate.COVARIATE_NULL) { key.add( keyElement ); } else { badKey = true; // Covariate returned bad value, for example dinuc returns null because base = 'N' @@ -280,7 +286,7 @@ public class CovariateCounterWalker extends LocusWalker { /** * Initialize the reudce step by creating a PrintStream from the filename specified as an argument to the walker. - * @return returns a PrintStream created from the -rf filename + * @return returns A PrintStream created from the -rf filename */ public PrintStream reduceInit() { try { @@ -292,9 +298,9 @@ public class CovariateCounterWalker extends LocusWalker { /** * The Reduce method doesn't do anything for this walker. - * @param value result of the map. - * @param recalTableStream the PrintStream - * @return returns the PrintStream used to output the CSV data + * @param value Result of the map. This value is immediately ignored. + * @param recalTableStream The PrintStream used to output the CSV data + * @return returns The PrintStream used to output the CSV data */ public PrintStream reduce( Integer value, PrintStream recalTableStream ) { return recalTableStream; // nothing to do here @@ -302,7 +308,7 @@ public class CovariateCounterWalker extends LocusWalker { /** * Write out the full data hashmap to disk in CSV format - * @param recalTableStream the PrintStream to write out to + * @param recalTableStream The PrintStream to write out to */ public void onTraversalDone( PrintStream recalTableStream ) { out.print( "Writing raw recalibration data..." ); @@ -314,7 +320,7 @@ public class CovariateCounterWalker extends LocusWalker { /** * For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format - * @param recalTableStream the PrintStream to write out to + * @param recalTableStream The PrintStream to write out to */ private void outputToCSV( PrintStream recalTableStream ) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java index 9090403db..3d6909ddc 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java @@ -46,18 +46,19 @@ public class DinucCovariate implements Covariate { public Comparable getValue(SAMRecord read, int offset, char[] refBases) { byte[] bases = read.getReadBases(); char base = (char)bases[offset]; - char prevBase; // set below + char prevBase; // value set below + // If it is a negative strand read then we need to get the complement base and reverse the direction for our previous base if( read.getReadNegativeStrandFlag() ) { base = BaseUtils.simpleComplement(base); - if ( offset + 1 > bases.length - 1 ) { return null; } // no prevBase because at the beginning of the read + if ( offset + 1 > bases.length - 1 ) { return Covariate.COVARIATE_ERROR; } // no prevBase because at the beginning of the read prevBase = BaseUtils.simpleComplement( (char)bases[offset + 1] ); } else { - if ( offset - 1 < 0 ) { return null; } // no prevBase because at the beginning of the read + if ( offset - 1 < 0 ) { return Covariate.COVARIATE_ERROR; } // no prevBase because at the beginning of the read prevBase = (char)bases[offset - 1]; } // Check if bad base, probably an 'N' if( ( BaseUtils.simpleBaseToBaseIndex(prevBase) == -1 ) || ( BaseUtils.simpleBaseToBaseIndex(base) == -1 ) ) { - return null; // CovariateCounterWalker will recognize that null means skip this particular location in the read + return Covariate.COVARIATE_NULL; // CovariateCounterWalker will recognize that null means skip this particular location in the read } else { return String.format("%c%c", prevBase, base); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java index fe17aad2e..1b215a2be 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java @@ -42,6 +42,15 @@ import java.io.FileNotFoundException; * Created by IntelliJ IDEA. * User: rpoplin * Date: Nov 3, 2009 + * + * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. + + * For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc) + * Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read. + * This walker then outputs a new bam file with these updated (recalibrated) reads. + * + * Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker. + * Note: This walker is designed to be used in conjunction with CovariateCounterWalker. */ @WalkerName("TableRecalibrationRefactored") @@ -64,9 +73,9 @@ public class TableRecalibrationWalker extends ReadWalker requestedCovariates; @@ -75,6 +84,17 @@ public class TableRecalibrationWalker extends ReadWalker key = new ArrayList(); @@ -164,6 +189,18 @@ public class TableRecalibrationWalker extends ReadWalker key = new ArrayList(); + boolean badKey = false; + Comparable keyElement; for( Covariate covariate : requestedCovariates ) { - key.add( covariate.getValue( read, iii, refBases ) ); // BUGBUG offset should be 1 based but old recalibrator is 0 based? - } - - if( MODE == RecalibrationMode.COMBINATORIAL ) { - RecalDatum datum = dataManager.data.get( key ); - if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing - recalQuals[iii] = datum.empiricalQualByte( SMOOTHING ); + keyElement = covariate.getValue( read, iii, refBases ); + if( keyElement != Covariate.COVARIATE_ERROR ) { // COVARIATE_NULL is okay because the sequential calculation will do the best it can with the valid data it has + key.add( keyElement ); // technically COVARIATE_ERROR is fine too, but this won't match the behavior of the old recalibrator + } else { + badKey = true; } - } else if( MODE == RecalibrationMode.SEQUENTIAL ) { - recalQuals[iii] = performSequentialQualityCalculation( key ); - } else { - throw new StingException( "Specified RecalibrationMode is not supported: " + MODE ); } - // Do some error checking on the new quality score - if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) { - throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] ); + if( !badKey ) { + if( MODE == RecalibrationMode.COMBINATORIAL ) { + RecalDatum datum = dataManager.data.get( key ); + if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing + recalQuals[iii] = datum.empiricalQualByte( SMOOTHING ); + } + } else if( MODE == RecalibrationMode.SEQUENTIAL ) { + recalQuals[iii] = performSequentialQualityCalculation( key ); + } else { + throw new StingException( "Specified RecalibrationMode is not supported: " + MODE ); + } + + // Do some error checking on the new quality score + if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) { + throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] ); + } } } @@ -211,6 +257,21 @@ public class TableRecalibrationWalker extends ReadWalker key ) { byte qualFromRead = Byte.parseByte(key.get(1).toString()); @@ -252,7 +313,18 @@ public class TableRecalibrationWalker extends ReadWalker %d + %.2f + %.2f + %.2f + %.2f = %d", + // qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte)); + //} + //else { + // System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d", + // key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) ); + //} + if( newQualityByte <= 0 && newQualityByte >= QualityUtils.MAX_REASONABLE_Q_SCORE ) { throw new StingException( "Illegal base quality score calculated: " + key + String.format( " => %d + %.2f + %.2f + %.2f = %d", qualFromRead, globalDeltaQ, deltaQReported, deltaQCovariates, newQualityByte ) ); @@ -261,6 +333,11 @@ public class TableRecalibrationWalker extends ReadWalker