From 562db45fa500bafa7050bb6e31674821de7c4fba Mon Sep 17 00:00:00 2001 From: rpoplin Date: Mon, 28 Dec 2009 20:19:37 +0000 Subject: [PATCH] Sites that were marked NO_DINUC no longer get dinuc-corrected but are still recalibrated using the other available covariates. Solid cycle is now the same as Illumina cycle pending an analysis that looks at the effect of PrimerRoundCovariate. Solid color space methods cleaned up to reduce number of calls to read.getAttribute(). Polished NHashMap sort method in preparation for move to core/utils. Added additional plots in AnalyzeCovariates to look at reported quality as a function of the covariate. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2451 348d0f76-0448-11de-a6fe-93d51630548a --- R/plot_residualError_OtherCovariate.R | 26 ++++++- R/plot_residualError_QualityScoreCovariate.R | 4 +- .../recalibration/CovariateCounterWalker.java | 11 ++- .../walkers/recalibration/CycleCovariate.java | 39 +++++----- .../walkers/recalibration/DinucCovariate.java | 32 +++++--- .../gatk/walkers/recalibration/NHashMap.java | 55 ++++++++------ .../recalibration/RecalDataManager.java | 75 +++++++++++++------ .../TableRecalibrationWalker.java | 2 +- .../RecalibrationWalkersIntegrationTest.java | 18 ++--- 9 files changed, 166 insertions(+), 96 deletions(-) diff --git a/R/plot_residualError_OtherCovariate.R b/R/plot_residualError_OtherCovariate.R index 9da7dbdf9..1b626d3a5 100644 --- a/R/plot_residualError_OtherCovariate.R +++ b/R/plot_residualError_OtherCovariate.R @@ -20,9 +20,9 @@ d.good <- c[c$nBases >= 1000,] d.1000 <- c[c$nBases < 1000,] rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) ) -theTitle = paste("RMSE_good = ", round(rmseGood,digits=3), ", RMSE_all = ", round(rmseAll,digits=3)) +theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3)) if( length(d.good$nBases) == length(c$nBases) ) { - theTitle = paste("RMSE = ", round(rmseAll,digits=3)) + theTitle = paste("RMSE =", round(rmseAll,digits=3)) } # Don't let residual error go off the edge of the plot d.good$residualError = d.good$Qempirical-d.good$Qreported @@ -47,6 +47,28 @@ if( is.numeric(c$Covariate) ) { } dev.off() + +# +# Plot mean quality versus the covariate +# + +outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +pointType = "p" +if( length(c$Covariate) <= 20 ) { + pointType = "o" +} +theTitle = paste("Quality By", covariateName); +if( is.numeric(c$Covariate) ) { + plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate))) + points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20) +} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice + plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40)) + points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue") +} +dev.off() + # # Plot histogram of the covariate # diff --git a/R/plot_residualError_QualityScoreCovariate.R b/R/plot_residualError_QualityScoreCovariate.R index 92fe439d0..06f03b8c8 100644 --- a/R/plot_residualError_QualityScoreCovariate.R +++ b/R/plot_residualError_QualityScoreCovariate.R @@ -20,9 +20,9 @@ f <- t[t$Qreported < Qcutoff,] e <- rbind(d.good, d.1000, d.10000) rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) ) -theTitle = paste("RMSE_good = ", round(rmseGood,digits=3), ", RMSE_all = ", round(rmseAll,digits=3)) +theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3)) if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) { - theTitle = paste("RMSE = ", round(rmseAll,digits=3)); + theTitle = paste("RMSE =", round(rmseAll,digits=3)); } plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score") points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index 1ce05679c..95d595945 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -101,9 +101,9 @@ public class CovariateCounterWalker extends LocusWalker { private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base - private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted. BUGBUG: I don't understand what is going on in this case + private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted. private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site - private static final String versionString = "v2.1.2"; // Major version, minor version, and build number + private static final String versionString = "v2.1.3"; // Major version, minor version, and build number private Pair dbSNP_counts = new Pair(0L, 0L); // mismatch/base counts for dbSNP loci private Pair novel_counts = new Pair(0L, 0L); // mismatch/base counts for non-dbSNP loci private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) @@ -261,7 +261,7 @@ public class CovariateCounterWalker extends LocusWalker { byte[] bases; // For each read at this locus - for ( PileupElement p : context.getPileup() ) { + for( PileupElement p : context.getPileup() ) { read = p.getRead(); offset = p.getOffset(); @@ -275,7 +275,6 @@ public class CovariateCounterWalker extends LocusWalker { refBase = (byte)ref.getBase(); // Skip if this base is an 'N' or etc. - // BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read. if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) { // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it @@ -334,7 +333,7 @@ public class CovariateCounterWalker extends LocusWalker { } } - /** + /** * Validate the dbSNP reference mismatch rates. */ private void validateDbsnpMismatchRate() { @@ -463,7 +462,7 @@ public class CovariateCounterWalker extends LocusWalker { if( SORTED_OUTPUT && requestedCovariates.size() == 4 ) { - for( Pair,RecalDatum> entry : dataManager.data.entrySetSorted4() ) { //BUGBUG: entrySetSorted4 isn't correct here + for( Pair,RecalDatum> entry : dataManager.data.entrySetSorted() ) { for( Comparable comp : entry.first ) { recalTableStream.print( comp + "," ); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index fab76357d..840c2a727 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -38,8 +38,8 @@ import net.sf.samtools.SAMRecord; * The Cycle covariate. * For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read) * For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle - * For example, for the read: AAACCCCGAAATTTTTACT - * the cycle would be 1111111122233333334 + * For example, for the read: AAACCCCGAAATTTTTACTG + * the cycle would be 00000000111222222233 * For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round */ @@ -63,10 +63,10 @@ public class CycleCovariate implements StandardCovariate { int cycle = 0; //----------------------------- - // ILLUMINA + // ILLUMINA and SOLID //----------------------------- - if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) ) { + if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) { cycle = offset; if( read.getReadNegativeStrandFlag() ) { cycle = read.getReadLength() - (offset + 1); @@ -109,17 +109,17 @@ public class CycleCovariate implements StandardCovariate { } //----------------------------- - // SOLID + // SOLID (unused), only to be used in conjunction with PrimerRoundCovariate //----------------------------- - else if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) { - // The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf - int pos = offset; - if( read.getReadNegativeStrandFlag() ) { - pos = read.getReadLength() - (offset + 1); - } - cycle = pos / 5; // integer division - } + //else if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) { + // // The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf + // int pos = offset; + // if( read.getReadNegativeStrandFlag() ) { + // pos = read.getReadLength() - (offset + 1); + // } + // cycle = pos / 5; // integer division + //} //----------------------------- // UNRECOGNIZED PLATFORM @@ -127,10 +127,10 @@ public class CycleCovariate implements StandardCovariate { else { // Platform is unrecognized so revert to the default platform but warn the user first if( !warnedUserBadPlatform ) { - if( defaultPlatform != null) { // the user set a default platform + if( defaultPlatform != null) { // The user set a default platform Utils.warnUser( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " + "Reverting to " + defaultPlatform + " definition of machine cycle." ); - } else { // the user did not set a default platform + } else { // The user did not set a default platform Utils.warnUser( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " + "Reverting to Illumina definition of machine cycle. Users may set the default platform using the --default_platform argument." ); defaultPlatform = "Illumina"; @@ -138,12 +138,13 @@ public class CycleCovariate implements StandardCovariate { warnedUserBadPlatform = true; } read.getReadGroup().setPlatform( defaultPlatform ); - return getValue( read, offset ); // a recursive call + return getValue( read, offset ); // A recursive call } - // differentiate between first and second of pair - if ( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) + // Differentiate between first and second of pair + if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { cycle *= -1; + } return cycle; } @@ -155,6 +156,6 @@ public class CycleCovariate implements StandardCovariate { // Used to estimate the amount space required for the full data HashMap public final int estimatedNumberOfBins() { - return 100; + return 80; } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java index a2a7d79d4..01c9af3fd 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java @@ -51,13 +51,13 @@ public class DinucCovariate implements StandardCovariate { public void initialize( final RecalibrationArgumentCollection RAC ) { final byte[] BASES = { (byte)'A', (byte)'C', (byte)'G', (byte)'T' }; dinucHashMap = new HashMap(); - for(byte byte1 : BASES) { - for(byte byte2: BASES) { + for( byte byte1 : BASES ) { + for( byte byte2: BASES ) { dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow } } - // add the "no dinuc" entry too - dinucHashMap.put( Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC); + // Add the "no dinuc" entry too + dinucHashMap.put( Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC ); } // Used to pick out the covariate's value from attributes of the read @@ -68,29 +68,39 @@ public class DinucCovariate implements StandardCovariate { byte[] bases = read.getReadBases(); // If this is a negative strand read then we need to reverse the direction for our previous base if( read.getReadNegativeStrandFlag() ) { - // no dinuc at the beginning of the read - if( offset == bases.length-1 ) + // No dinuc at the beginning of the read + if( offset == bases.length-1 ) { return NO_DINUC; + } base = (byte)BaseUtils.simpleComplement( (char)(bases[offset]) ); + // Note: We are using the previous base in the read, not the previous base in the reference. This is done in part to be consistent with unmapped reads. prevBase = (byte)BaseUtils.simpleComplement( (char)(bases[offset + 1]) ); } else { - // no dinuc at the beginning of the read - if( offset == 0 ) + // No dinuc at the beginning of the read + if( offset == 0 ) { return NO_DINUC; + } base = bases[offset]; + // Note: We are using the previous base in the read, not the previous base in the reference. This is done in part to be consistent with unmapped reads. prevBase = bases[offset - 1]; } - // make sure the previous base is good - if( !BaseUtils.isRegularBase(prevBase) ) + // Make sure the previous base is good + if( !BaseUtils.isRegularBase( prevBase ) ) { return NO_DINUC; + } return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) ); } // Used to get the covariate's value from input csv file in TableRecalibrationWalker public final Comparable getValue( final String str ) { - return dinucHashMap.get( Dinuc.hashBytes( (byte)str.charAt(0), (byte)str.charAt(1) ) ); + Dinuc returnDinuc = dinucHashMap.get( Dinuc.hashBytes( (byte)str.charAt(0), (byte)str.charAt(1) ) ); + if( returnDinuc.compareTo(NO_DINUC) == 0 ) { + return null; + } + return returnDinuc; + } // Used to estimate the amount space required for the full data HashMap diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/NHashMap.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/NHashMap.java index 8ce436838..7d48d90d5 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/NHashMap.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/NHashMap.java @@ -41,7 +41,7 @@ import java.util.*; public class NHashMap extends HashMap, T> { - private static final long serialVersionUID = 1L; //BUGBUG: what should I do here?, added by Eclipse + private static final long serialVersionUID = 1L; // Added by Eclipse private ArrayList> keyLists; public NHashMap() { @@ -78,10 +78,7 @@ public class NHashMap extends HashMap, T> { return super.put( key, value ); } - // This method is very ugly but is here only to help facilitate direct comparison of old and refactored recalibrator. - // The old recalibrator prints out its mappings in a sorted order but the refactored recalibrator doesn't need to. - @SuppressWarnings(value = "unchecked") - public ArrayList, T>> entrySetSorted4() { + public ArrayList, T>> entrySetSorted() { ArrayList, T>> theSet = new ArrayList, T>>(); @@ -89,35 +86,49 @@ public class NHashMap extends HashMap, T> { Collections.sort(list); } - if( keyLists.size() != 4 ) { - throw new StingException("Are you sure you want to be calling this ugly method? NHashMap.entrySetSorted4()"); + int[] keyIndex = new int[ keyLists.size() ]; + int[] maxIndex = new int[ keyLists.size() ]; + for( int iii = 0; iii < keyLists.size(); iii++ ) { + keyIndex[iii] = 0; + maxIndex[iii] = keyLists.get(iii).size(); } + // Try all the possible keys in sorted order, add them to the output set if they are in the hashMap + boolean triedAllKeys = false; ArrayList newKey = null; - for( Comparable c0 : keyLists.get(0) ) { - for( Comparable c1 : keyLists.get(1) ) { - for( Comparable c2 : keyLists.get(2) ) { - for( Comparable c3 : keyLists.get(3) ) { - newKey = new ArrayList(); - newKey.add(c0); - newKey.add(c1); - newKey.add(c2); - newKey.add(c3); - T value = this.get( newKey ); - if( value!= null ) { - theSet.add(new Pair,T>( newKey, value ) ); - } + while( !triedAllKeys ) { + newKey = new ArrayList(); + for( int iii = 0; iii < keyLists.size(); iii++ ) { + newKey.add(keyLists.get(iii).get(keyIndex[iii])); + } + T value = this.get( newKey ); + if( value!= null ) { + theSet.add(new Pair,T>( newKey, value ) ); + } + + // Increment the keyIndex + keyIndex[keyLists.size() - 1]++; + for( int iii = keyLists.size() - 1; iii >= 0; iii-- ) { + if( keyIndex[iii] >= maxIndex[iii] ) { // Carry it forward + keyIndex[iii] = 0; + if( iii > 0 ) { + keyIndex[iii-1]++; + } else { + triedAllKeys = true; + break; } + } else { + break; } } } - return theSet; } + // Used to make the key from a list of objects public static List makeList(T... args) { List list = new ArrayList(); - for (T arg : args) + for( T arg : args ) { list.add(arg); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index c54e34375..261b3ec1f 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -128,12 +128,15 @@ public class RecalDataManager { newKey = new ArrayList(); newKey.add( key.get(0) ); // Make a new key with the read group ... newKey.add( key.get(1) ); // and quality score ... - newKey.add( key.get(iii + 2) ); // and the given covariate - collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey ); - if( collapsedDatum == null ) { - dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) ); - } else { - collapsedDatum.increment( fullDatum ); + Comparable theCovariateElement = key.get(iii + 2); // and the given covariate + if( theCovariateElement != null ) { + newKey.add( theCovariateElement ); + collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey ); + if( collapsedDatum == null ) { + dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) ); + } else { + collapsedDatum.increment( fullDatum ); + } } } } @@ -218,12 +221,14 @@ public class RecalDataManager { public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) { // Check if we need to use the original quality scores instead - if( RAC.USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { - Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); - if( obj instanceof String ) - read.setBaseQualities( QualityUtils.fastqToPhred((String)obj) ); - else { - throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); + if( RAC.USE_ORIGINAL_QUALS ) { + Object attr = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); + if( attr != null ) { + if( attr instanceof String ) { + read.setBaseQualities( QualityUtils.fastqToPhred((String)attr) ); + } else { + throw new StingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); + } } } @@ -267,8 +272,16 @@ public class RecalDataManager { // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) { if( read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null ) { // Haven't calculated the inconsistency array yet for this read - if( read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG) != null ) { - char[] colorSpace = ((String)read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG)).toCharArray(); + Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); + if( attr != null ) { + char[] colorSpace; + if( attr instanceof String ) { + colorSpace = ((String)attr).toCharArray(); + } else { + throw new StingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); + } + + // Loop over the read and calculate first the infered bases from the color and then check if it is consistent with the read byte[] readBases = read.getReadBases(); if( read.getReadNegativeStrandFlag() ) { @@ -305,8 +318,15 @@ public class RecalDataManager { */ public static byte[] calcColorSpace( SAMRecord read, byte[] originalQualScores, final String SOLID_RECAL_MODE, final Random coinFlip, final char[] refBases ) { - if( read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG) != null ) { - char[] colorSpace = ((String)read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG)).toCharArray(); + Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); + if( attr != null ) { + char[] colorSpace; + if( attr instanceof String ) { + colorSpace = ((String)attr).toCharArray(); + } else { + throw new StingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); + } + // Loop over the read and calculate first the infered bases from the color and then check if it is consistent with the read byte[] readBases = read.getReadBases(); byte[] colorImpliedBases = readBases.clone(); @@ -334,7 +354,7 @@ public class RecalDataManager { boolean setBaseN = true; originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, isMappedToReference, setBaseN); } else if( SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") ) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases - solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBases, isMappedToReference, coinFlip); + solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead, isMappedToReference, coinFlip); } } else if ( !warnUserNoColorSpace ) { // Warn the user if we can't find the color space tag @@ -349,7 +369,7 @@ public class RecalDataManager { /** * Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero * @param read The SAMRecord to recalibrate - * @param readBases The bases in the read RC'd if necessary + * @param readBases The bases in the read which have been RC'd if necessary * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color * @param originalQualScores The array of original quality scores to set to zero if needed * @param refBases The reference which has been RC'd if necessary @@ -360,7 +380,7 @@ public class RecalDataManager { private static byte[] solidRecalSetToQZero( SAMRecord read, byte[] readBases, int[] inconsistency, byte[] originalQualScores, final char[] refBases, final boolean isMappedToRef, final boolean setBaseN ) { - for( int iii = 1; iii < originalQualScores.length - 1; iii++ ) { // BUGBUG: This relies on the fact that in SOLID bams the first and last base are set to Q0 and aren't recalibrated anyway + for( int iii = 1; iii < originalQualScores.length - 1; iii++ ) { if( inconsistency[iii] == 1 ) { if( !isMappedToRef || (char)readBases[iii] == refBases[iii] ) { originalQualScores[iii] = (byte)0; @@ -387,7 +407,7 @@ public class RecalDataManager { /** * Peform the REMOVE_REF_BIAS solid recalibration. Look at the color space qualities and probabilistically decide if the base should be change to match the color or left as reference * @param read The SAMRecord to recalibrate - * @param readBases The bases in the read RC'd if necessary + * @param readBases The bases in the read which have been RC'd if necessary * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color * @param colorImpliedBases The bases implied by the color space, RC'd if necessary * @param refBases The reference which has been RC'd if necessary @@ -396,8 +416,15 @@ public class RecalDataManager { */ private static void solidRecalRemoveRefBias( SAMRecord read, byte[] readBases, int[] inconsistency, byte[] colorImpliedBases, final char[] refBases, final boolean isMappedToRef, final Random coinFlip ) { - if( read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG) != null ) { - byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG)); + + Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG); + if( attr != null ) { + byte[] colorSpaceQuals; + if( attr instanceof String ) { + colorSpaceQuals = QualityUtils.fastqToPhred((String)attr); + } else { + throw new StingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName())); + } for( int iii = 1; iii < inconsistency.length - 2; iii++ ) { if( inconsistency[iii] == 1 ) { @@ -413,8 +440,8 @@ public class RecalDataManager { int minQuality = Math.min((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]); int diffInQuality = maxQuality - minQuality; int numLow = minQuality; - if(numLow == 0) { numLow = 1; } - int numHigh = Math.round(numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely + if( numLow == 0 ) { numLow = 1; } + int numHigh = Math.round( numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely int rand = coinFlip.nextInt( numLow + numHigh ); if( rand >= numLow ) { if( maxQuality == (int)colorSpaceQuals[jjj] ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index b61e04a7a..8f662a9dc 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -94,7 +94,7 @@ public class TableRecalibrationWalker extends ReadWalker e = new HashMap(); e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "c1b54d4221fb4fa88e0231a74310708e" ); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "95e26e8247d0c5e43705048c5ee64873"); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "337ea30c4dcc2fe6a9adc442ffd0706b"); e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "ffbfd38b1720cfb67ba1bb63d4308552" ); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "427306f07f5fd905439b28a770f3d3d6" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "60e227ea8c3409fa85b92cae7ea6574f" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -49,10 +49,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibrator1() { HashMap e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "ca839a4eb0ef443e0486b96843304f92" ); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "fd27c3ab424ef01c77d2dbdedf721a8a"); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b6e16538bda18336f176417cdf686f6a" ); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "be9a46e0d9b7ad90ce303d08dfb7b4de" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "f7749792ffffbb86aec66e92a3bddf7f" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "f1780e3c3e12f07527e0468149312f10"); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "c54a67a1687a4139a8ae19762217987f" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "d9ddbacdafc621d830a1db637973d795" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -80,7 +80,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesVCF() { HashMap e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "5e132283b906f6de3328986fd0101be7"); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "3ee0e811682c0f29951128204765ece9"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -108,7 +108,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesNoReadGroups() { HashMap e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "199979b483f6c03c0977141f4fea9961" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "f49bc79225bffbf8b64590b65a4b4305" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -138,7 +138,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorNoReadGroups() { HashMap e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "2889c63854e233fadd9178ccf5b2517b" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "62413fdbfe99cd6e24992de4234de5bc" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey();