From a13cbe1df079753590932469e6f657b21cdfd7d7 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Tue, 10 Nov 2009 22:20:06 +0000 Subject: [PATCH] The refactored recalibrator now passes the integration tests as well as my own validation tests. I'm ready to have other people start jamming on the files. I'll make an updated wiki page soon. The refactored recalibrator is currently a bit slower than the old one but there were a lot of great, easy ideas today for how to improve it. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2013 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/Recalibration/Covariate.java | 2 + .../Recalibration/CovariateCounterWalker.java | 36 +++-- .../walkers/Recalibration/DinucCovariate.java | 9 +- .../TableRecalibrationWalker.java | 133 +++++++++++++++--- 4 files changed, 141 insertions(+), 39 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java index 0dccab54f..9f1b99046 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java @@ -36,6 +36,8 @@ import net.sf.samtools.SAMRecord; */ public interface Covariate { + public static final String COVARIATE_ERROR = "COVARIATE_ERROR"; + public static final String COVARIATE_NULL = "COVARITATE_NULL"; public Comparable getValue(SAMRecord read, int offset, char[] refBases); // used to pick out the value from the read, offset, and reference bases public Comparable getValue(String str); // used to get value from input file public int estimatedNumberOfBins(); // used to estimate the amount space required in the HashMap diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java index 404ce4959..a4c50e15d 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java @@ -56,7 +56,7 @@ import net.sf.samtools.SAMRecord; * The output file is a CSV list of (several covariate values, num observations, num mismatches, empirical quality score) * The first lines of the output file give the name of the covariate classes that were used for this calculation. * - * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and are automatically added first. + * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and must be at the start of the list. * Note: This walker is designed to be used in conjunction with TableRecalibrationWalker. */ @@ -83,6 +83,12 @@ public class CovariateCounterWalker extends LocusWalker { private long countedBases = 0; // used for reporting at the end private long skippedSites = 0; // used for reporting at the end + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- + /** * Parse the -cov arguments and create a list of covariates to be used here * Based on the covariates' estimates for initial capacity allocate the data hashmap @@ -167,10 +173,10 @@ public class CovariateCounterWalker extends LocusWalker { /** * For each read at this locus get the various covariate values and increment that location in the map based on * whether or not the base matches the reference at this particular location - * @param tracker the reference metadata tracker - * @param ref the reference context - * @param context the alignment context - * @return returns 1, but this value isn't used in the reduce step + * @param tracker The reference metadata tracker + * @param ref The reference context + * @param context The alignment context + * @return Returns 1, but this value isn't used in the reduce step */ public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { @@ -214,9 +220,9 @@ public class CovariateCounterWalker extends LocusWalker { * Loop through the list of requested covariates and pick out the value from the read, offset, and reference * Using the list of covariate values as a key, pick out the RecalDatum and increment, * adding one to the number of observations and potentially one to the number of mismatches - * @param read the read - * @param offset the offset in the read for this locus - * @param ref the reference context + * @param read The read + * @param offset The offset in the read for this locus + * @param ref The reference context */ private void updateDataFromRead(SAMRecord read, int offset, ReferenceContext ref) { @@ -227,7 +233,7 @@ public class CovariateCounterWalker extends LocusWalker { // Loop through the list of requested covariates and pick out the value from the read, offset, and reference for( Covariate covariate : requestedCovariates ) { keyElement = covariate.getValue( read, offset, ref.getBases() ); - if( keyElement != null ) { + if( keyElement != Covariate.COVARIATE_ERROR && keyElement != Covariate.COVARIATE_NULL) { key.add( keyElement ); } else { badKey = true; // Covariate returned bad value, for example dinuc returns null because base = 'N' @@ -280,7 +286,7 @@ public class CovariateCounterWalker extends LocusWalker { /** * Initialize the reudce step by creating a PrintStream from the filename specified as an argument to the walker. - * @return returns a PrintStream created from the -rf filename + * @return returns A PrintStream created from the -rf filename */ public PrintStream reduceInit() { try { @@ -292,9 +298,9 @@ public class CovariateCounterWalker extends LocusWalker { /** * The Reduce method doesn't do anything for this walker. - * @param value result of the map. - * @param recalTableStream the PrintStream - * @return returns the PrintStream used to output the CSV data + * @param value Result of the map. This value is immediately ignored. + * @param recalTableStream The PrintStream used to output the CSV data + * @return returns The PrintStream used to output the CSV data */ public PrintStream reduce( Integer value, PrintStream recalTableStream ) { return recalTableStream; // nothing to do here @@ -302,7 +308,7 @@ public class CovariateCounterWalker extends LocusWalker { /** * Write out the full data hashmap to disk in CSV format - * @param recalTableStream the PrintStream to write out to + * @param recalTableStream The PrintStream to write out to */ public void onTraversalDone( PrintStream recalTableStream ) { out.print( "Writing raw recalibration data..." ); @@ -314,7 +320,7 @@ public class CovariateCounterWalker extends LocusWalker { /** * For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format - * @param recalTableStream the PrintStream to write out to + * @param recalTableStream The PrintStream to write out to */ private void outputToCSV( PrintStream recalTableStream ) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java index 9090403db..3d6909ddc 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java @@ -46,18 +46,19 @@ public class DinucCovariate implements Covariate { public Comparable getValue(SAMRecord read, int offset, char[] refBases) { byte[] bases = read.getReadBases(); char base = (char)bases[offset]; - char prevBase; // set below + char prevBase; // value set below + // If it is a negative strand read then we need to get the complement base and reverse the direction for our previous base if( read.getReadNegativeStrandFlag() ) { base = BaseUtils.simpleComplement(base); - if ( offset + 1 > bases.length - 1 ) { return null; } // no prevBase because at the beginning of the read + if ( offset + 1 > bases.length - 1 ) { return Covariate.COVARIATE_ERROR; } // no prevBase because at the beginning of the read prevBase = BaseUtils.simpleComplement( (char)bases[offset + 1] ); } else { - if ( offset - 1 < 0 ) { return null; } // no prevBase because at the beginning of the read + if ( offset - 1 < 0 ) { return Covariate.COVARIATE_ERROR; } // no prevBase because at the beginning of the read prevBase = (char)bases[offset - 1]; } // Check if bad base, probably an 'N' if( ( BaseUtils.simpleBaseToBaseIndex(prevBase) == -1 ) || ( BaseUtils.simpleBaseToBaseIndex(base) == -1 ) ) { - return null; // CovariateCounterWalker will recognize that null means skip this particular location in the read + return Covariate.COVARIATE_NULL; // CovariateCounterWalker will recognize that null means skip this particular location in the read } else { return String.format("%c%c", prevBase, base); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java index fe17aad2e..1b215a2be 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java @@ -42,6 +42,15 @@ import java.io.FileNotFoundException; * Created by IntelliJ IDEA. * User: rpoplin * Date: Nov 3, 2009 + * + * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. + + * For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc) + * Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read. + * This walker then outputs a new bam file with these updated (recalibrated) reads. + * + * Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker. + * Note: This walker is designed to be used in conjunction with CovariateCounterWalker. */ @WalkerName("TableRecalibrationRefactored") @@ -64,9 +73,9 @@ public class TableRecalibrationWalker extends ReadWalker requestedCovariates; @@ -75,6 +84,17 @@ public class TableRecalibrationWalker extends ReadWalker key = new ArrayList(); @@ -164,6 +189,18 @@ public class TableRecalibrationWalker extends ReadWalker key = new ArrayList(); + boolean badKey = false; + Comparable keyElement; for( Covariate covariate : requestedCovariates ) { - key.add( covariate.getValue( read, iii, refBases ) ); // BUGBUG offset should be 1 based but old recalibrator is 0 based? - } - - if( MODE == RecalibrationMode.COMBINATORIAL ) { - RecalDatum datum = dataManager.data.get( key ); - if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing - recalQuals[iii] = datum.empiricalQualByte( SMOOTHING ); + keyElement = covariate.getValue( read, iii, refBases ); + if( keyElement != Covariate.COVARIATE_ERROR ) { // COVARIATE_NULL is okay because the sequential calculation will do the best it can with the valid data it has + key.add( keyElement ); // technically COVARIATE_ERROR is fine too, but this won't match the behavior of the old recalibrator + } else { + badKey = true; } - } else if( MODE == RecalibrationMode.SEQUENTIAL ) { - recalQuals[iii] = performSequentialQualityCalculation( key ); - } else { - throw new StingException( "Specified RecalibrationMode is not supported: " + MODE ); } - // Do some error checking on the new quality score - if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) { - throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] ); + if( !badKey ) { + if( MODE == RecalibrationMode.COMBINATORIAL ) { + RecalDatum datum = dataManager.data.get( key ); + if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing + recalQuals[iii] = datum.empiricalQualByte( SMOOTHING ); + } + } else if( MODE == RecalibrationMode.SEQUENTIAL ) { + recalQuals[iii] = performSequentialQualityCalculation( key ); + } else { + throw new StingException( "Specified RecalibrationMode is not supported: " + MODE ); + } + + // Do some error checking on the new quality score + if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) { + throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] ); + } } } @@ -211,6 +257,21 @@ public class TableRecalibrationWalker extends ReadWalker key ) { byte qualFromRead = Byte.parseByte(key.get(1).toString()); @@ -252,7 +313,18 @@ public class TableRecalibrationWalker extends ReadWalker %d + %.2f + %.2f + %.2f + %.2f = %d", + // qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte)); + //} + //else { + // System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d", + // key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) ); + //} + if( newQualityByte <= 0 && newQualityByte >= QualityUtils.MAX_REASONABLE_Q_SCORE ) { throw new StingException( "Illegal base quality score calculated: " + key + String.format( " => %d + %.2f + %.2f + %.2f = %d", qualFromRead, globalDeltaQ, deltaQReported, deltaQCovariates, newQualityByte ) ); @@ -261,6 +333,11 @@ public class TableRecalibrationWalker extends ReadWalker