diff --git a/R/plot_residualError_OtherCovariate.R b/R/plot_residualError_OtherCovariate.R index 9da7dbdf9..1b626d3a5 100644 --- a/R/plot_residualError_OtherCovariate.R +++ b/R/plot_residualError_OtherCovariate.R @@ -20,9 +20,9 @@ d.good <- c[c$nBases >= 1000,] d.1000 <- c[c$nBases < 1000,] rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) ) -theTitle = paste("RMSE_good = ", round(rmseGood,digits=3), ", RMSE_all = ", round(rmseAll,digits=3)) +theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3)) if( length(d.good$nBases) == length(c$nBases) ) { - theTitle = paste("RMSE = ", round(rmseAll,digits=3)) + theTitle = paste("RMSE =", round(rmseAll,digits=3)) } # Don't let residual error go off the edge of the plot d.good$residualError = d.good$Qempirical-d.good$Qreported @@ -47,6 +47,28 @@ if( is.numeric(c$Covariate) ) { } dev.off() + +# +# Plot mean quality versus the covariate +# + +outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +pointType = "p" +if( length(c$Covariate) <= 20 ) { + pointType = "o" +} +theTitle = paste("Quality By", covariateName); +if( is.numeric(c$Covariate) ) { + plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate))) + points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20) +} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice + plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40)) + points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue") +} +dev.off() + # # Plot histogram of the covariate # diff --git a/R/plot_residualError_QualityScoreCovariate.R b/R/plot_residualError_QualityScoreCovariate.R index 92fe439d0..06f03b8c8 100644 --- a/R/plot_residualError_QualityScoreCovariate.R +++ b/R/plot_residualError_QualityScoreCovariate.R @@ -20,9 +20,9 @@ f <- t[t$Qreported < Qcutoff,] e <- rbind(d.good, d.1000, d.10000) rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) ) -theTitle = paste("RMSE_good = ", round(rmseGood,digits=3), ", RMSE_all = ", round(rmseAll,digits=3)) +theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3)) if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) { - theTitle = paste("RMSE = ", round(rmseAll,digits=3)); + theTitle = paste("RMSE =", round(rmseAll,digits=3)); } plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score") points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index 1ce05679c..95d595945 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -101,9 +101,9 @@ public class CovariateCounterWalker extends LocusWalker { private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base - private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted. BUGBUG: I don't understand what is going on in this case + private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted. private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site - private static final String versionString = "v2.1.2"; // Major version, minor version, and build number + private static final String versionString = "v2.1.3"; // Major version, minor version, and build number private Pair dbSNP_counts = new Pair(0L, 0L); // mismatch/base counts for dbSNP loci private Pair novel_counts = new Pair(0L, 0L); // mismatch/base counts for non-dbSNP loci private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) @@ -261,7 +261,7 @@ public class CovariateCounterWalker extends LocusWalker { byte[] bases; // For each read at this locus - for ( PileupElement p : context.getPileup() ) { + for( PileupElement p : context.getPileup() ) { read = p.getRead(); offset = p.getOffset(); @@ -275,7 +275,6 @@ public class CovariateCounterWalker extends LocusWalker { refBase = (byte)ref.getBase(); // Skip if this base is an 'N' or etc. - // BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read. if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) { // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it @@ -334,7 +333,7 @@ public class CovariateCounterWalker extends LocusWalker { } } - /** + /** * Validate the dbSNP reference mismatch rates. */ private void validateDbsnpMismatchRate() { @@ -463,7 +462,7 @@ public class CovariateCounterWalker extends LocusWalker { if( SORTED_OUTPUT && requestedCovariates.size() == 4 ) { - for( Pair,RecalDatum> entry : dataManager.data.entrySetSorted4() ) { //BUGBUG: entrySetSorted4 isn't correct here + for( Pair,RecalDatum> entry : dataManager.data.entrySetSorted() ) { for( Comparable comp : entry.first ) { recalTableStream.print( comp + "," ); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index fab76357d..840c2a727 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -38,8 +38,8 @@ import net.sf.samtools.SAMRecord; * The Cycle covariate. * For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read) * For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle - * For example, for the read: AAACCCCGAAATTTTTACT - * the cycle would be 1111111122233333334 + * For example, for the read: AAACCCCGAAATTTTTACTG + * the cycle would be 00000000111222222233 * For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round */ @@ -63,10 +63,10 @@ public class CycleCovariate implements StandardCovariate { int cycle = 0; //----------------------------- - // ILLUMINA + // ILLUMINA and SOLID //----------------------------- - if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) ) { + if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) { cycle = offset; if( read.getReadNegativeStrandFlag() ) { cycle = read.getReadLength() - (offset + 1); @@ -109,17 +109,17 @@ public class CycleCovariate implements StandardCovariate { } //----------------------------- - // SOLID + // SOLID (unused), only to be used in conjunction with PrimerRoundCovariate //----------------------------- - else if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) { - // The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf - int pos = offset; - if( read.getReadNegativeStrandFlag() ) { - pos = read.getReadLength() - (offset + 1); - } - cycle = pos / 5; // integer division - } + //else if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) { + // // The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf + // int pos = offset; + // if( read.getReadNegativeStrandFlag() ) { + // pos = read.getReadLength() - (offset + 1); + // } + // cycle = pos / 5; // integer division + //} //----------------------------- // UNRECOGNIZED PLATFORM @@ -127,10 +127,10 @@ public class CycleCovariate implements StandardCovariate { else { // Platform is unrecognized so revert to the default platform but warn the user first if( !warnedUserBadPlatform ) { - if( defaultPlatform != null) { // the user set a default platform + if( defaultPlatform != null) { // The user set a default platform Utils.warnUser( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " + "Reverting to " + defaultPlatform + " definition of machine cycle." ); - } else { // the user did not set a default platform + } else { // The user did not set a default platform Utils.warnUser( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " + "Reverting to Illumina definition of machine cycle. Users may set the default platform using the --default_platform argument." ); defaultPlatform = "Illumina"; @@ -138,12 +138,13 @@ public class CycleCovariate implements StandardCovariate { warnedUserBadPlatform = true; } read.getReadGroup().setPlatform( defaultPlatform ); - return getValue( read, offset ); // a recursive call + return getValue( read, offset ); // A recursive call } - // differentiate between first and second of pair - if ( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) + // Differentiate between first and second of pair + if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { cycle *= -1; + } return cycle; } @@ -155,6 +156,6 @@ public class CycleCovariate implements StandardCovariate { // Used to estimate the amount space required for the full data HashMap public final int estimatedNumberOfBins() { - return 100; + return 80; } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java index a2a7d79d4..01c9af3fd 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java @@ -51,13 +51,13 @@ public class DinucCovariate implements StandardCovariate { public void initialize( final RecalibrationArgumentCollection RAC ) { final byte[] BASES = { (byte)'A', (byte)'C', (byte)'G', (byte)'T' }; dinucHashMap = new HashMap(); - for(byte byte1 : BASES) { - for(byte byte2: BASES) { + for( byte byte1 : BASES ) { + for( byte byte2: BASES ) { dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow } } - // add the "no dinuc" entry too - dinucHashMap.put( Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC); + // Add the "no dinuc" entry too + dinucHashMap.put( Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC ); } // Used to pick out the covariate's value from attributes of the read @@ -68,29 +68,39 @@ public class DinucCovariate implements StandardCovariate { byte[] bases = read.getReadBases(); // If this is a negative strand read then we need to reverse the direction for our previous base if( read.getReadNegativeStrandFlag() ) { - // no dinuc at the beginning of the read - if( offset == bases.length-1 ) + // No dinuc at the beginning of the read + if( offset == bases.length-1 ) { return NO_DINUC; + } base = (byte)BaseUtils.simpleComplement( (char)(bases[offset]) ); + // Note: We are using the previous base in the read, not the previous base in the reference. This is done in part to be consistent with unmapped reads. prevBase = (byte)BaseUtils.simpleComplement( (char)(bases[offset + 1]) ); } else { - // no dinuc at the beginning of the read - if( offset == 0 ) + // No dinuc at the beginning of the read + if( offset == 0 ) { return NO_DINUC; + } base = bases[offset]; + // Note: We are using the previous base in the read, not the previous base in the reference. This is done in part to be consistent with unmapped reads. prevBase = bases[offset - 1]; } - // make sure the previous base is good - if( !BaseUtils.isRegularBase(prevBase) ) + // Make sure the previous base is good + if( !BaseUtils.isRegularBase( prevBase ) ) { return NO_DINUC; + } return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) ); } // Used to get the covariate's value from input csv file in TableRecalibrationWalker public final Comparable getValue( final String str ) { - return dinucHashMap.get( Dinuc.hashBytes( (byte)str.charAt(0), (byte)str.charAt(1) ) ); + Dinuc returnDinuc = dinucHashMap.get( Dinuc.hashBytes( (byte)str.charAt(0), (byte)str.charAt(1) ) ); + if( returnDinuc.compareTo(NO_DINUC) == 0 ) { + return null; + } + return returnDinuc; + } // Used to estimate the amount space required for the full data HashMap diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/NHashMap.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/NHashMap.java index 8ce436838..7d48d90d5 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/NHashMap.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/NHashMap.java @@ -41,7 +41,7 @@ import java.util.*; public class NHashMap extends HashMap, T> { - private static final long serialVersionUID = 1L; //BUGBUG: what should I do here?, added by Eclipse + private static final long serialVersionUID = 1L; // Added by Eclipse private ArrayList> keyLists; public NHashMap() { @@ -78,10 +78,7 @@ public class NHashMap extends HashMap, T> { return super.put( key, value ); } - // This method is very ugly but is here only to help facilitate direct comparison of old and refactored recalibrator. - // The old recalibrator prints out its mappings in a sorted order but the refactored recalibrator doesn't need to. - @SuppressWarnings(value = "unchecked") - public ArrayList, T>> entrySetSorted4() { + public ArrayList, T>> entrySetSorted() { ArrayList, T>> theSet = new ArrayList, T>>(); @@ -89,35 +86,49 @@ public class NHashMap extends HashMap, T> { Collections.sort(list); } - if( keyLists.size() != 4 ) { - throw new StingException("Are you sure you want to be calling this ugly method? NHashMap.entrySetSorted4()"); + int[] keyIndex = new int[ keyLists.size() ]; + int[] maxIndex = new int[ keyLists.size() ]; + for( int iii = 0; iii < keyLists.size(); iii++ ) { + keyIndex[iii] = 0; + maxIndex[iii] = keyLists.get(iii).size(); } + // Try all the possible keys in sorted order, add them to the output set if they are in the hashMap + boolean triedAllKeys = false; ArrayList newKey = null; - for( Comparable c0 : keyLists.get(0) ) { - for( Comparable c1 : keyLists.get(1) ) { - for( Comparable c2 : keyLists.get(2) ) { - for( Comparable c3 : keyLists.get(3) ) { - newKey = new ArrayList(); - newKey.add(c0); - newKey.add(c1); - newKey.add(c2); - newKey.add(c3); - T value = this.get( newKey ); - if( value!= null ) { - theSet.add(new Pair,T>( newKey, value ) ); - } + while( !triedAllKeys ) { + newKey = new ArrayList(); + for( int iii = 0; iii < keyLists.size(); iii++ ) { + newKey.add(keyLists.get(iii).get(keyIndex[iii])); + } + T value = this.get( newKey ); + if( value!= null ) { + theSet.add(new Pair,T>( newKey, value ) ); + } + + // Increment the keyIndex + keyIndex[keyLists.size() - 1]++; + for( int iii = keyLists.size() - 1; iii >= 0; iii-- ) { + if( keyIndex[iii] >= maxIndex[iii] ) { // Carry it forward + keyIndex[iii] = 0; + if( iii > 0 ) { + keyIndex[iii-1]++; + } else { + triedAllKeys = true; + break; } + } else { + break; } } } - return theSet; } + // Used to make the key from a list of objects public static List makeList(T... args) { List list = new ArrayList(); - for (T arg : args) + for( T arg : args ) { list.add(arg); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index c54e34375..261b3ec1f 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -128,12 +128,15 @@ public class RecalDataManager { newKey = new ArrayList(); newKey.add( key.get(0) ); // Make a new key with the read group ... newKey.add( key.get(1) ); // and quality score ... - newKey.add( key.get(iii + 2) ); // and the given covariate - collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey ); - if( collapsedDatum == null ) { - dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) ); - } else { - collapsedDatum.increment( fullDatum ); + Comparable theCovariateElement = key.get(iii + 2); // and the given covariate + if( theCovariateElement != null ) { + newKey.add( theCovariateElement ); + collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey ); + if( collapsedDatum == null ) { + dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) ); + } else { + collapsedDatum.increment( fullDatum ); + } } } } @@ -218,12 +221,14 @@ public class RecalDataManager { public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) { // Check if we need to use the original quality scores instead - if( RAC.USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { - Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); - if( obj instanceof String ) - read.setBaseQualities( QualityUtils.fastqToPhred((String)obj) ); - else { - throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); + if( RAC.USE_ORIGINAL_QUALS ) { + Object attr = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); + if( attr != null ) { + if( attr instanceof String ) { + read.setBaseQualities( QualityUtils.fastqToPhred((String)attr) ); + } else { + throw new StingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); + } } } @@ -267,8 +272,16 @@ public class RecalDataManager { // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) { if( read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null ) { // Haven't calculated the inconsistency array yet for this read - if( read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG) != null ) { - char[] colorSpace = ((String)read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG)).toCharArray(); + Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); + if( attr != null ) { + char[] colorSpace; + if( attr instanceof String ) { + colorSpace = ((String)attr).toCharArray(); + } else { + throw new StingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); + } + + // Loop over the read and calculate first the infered bases from the color and then check if it is consistent with the read byte[] readBases = read.getReadBases(); if( read.getReadNegativeStrandFlag() ) { @@ -305,8 +318,15 @@ public class RecalDataManager { */ public static byte[] calcColorSpace( SAMRecord read, byte[] originalQualScores, final String SOLID_RECAL_MODE, final Random coinFlip, final char[] refBases ) { - if( read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG) != null ) { - char[] colorSpace = ((String)read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG)).toCharArray(); + Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); + if( attr != null ) { + char[] colorSpace; + if( attr instanceof String ) { + colorSpace = ((String)attr).toCharArray(); + } else { + throw new StingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); + } + // Loop over the read and calculate first the infered bases from the color and then check if it is consistent with the read byte[] readBases = read.getReadBases(); byte[] colorImpliedBases = readBases.clone(); @@ -334,7 +354,7 @@ public class RecalDataManager { boolean setBaseN = true; originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, isMappedToReference, setBaseN); } else if( SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") ) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases - solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBases, isMappedToReference, coinFlip); + solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead, isMappedToReference, coinFlip); } } else if ( !warnUserNoColorSpace ) { // Warn the user if we can't find the color space tag @@ -349,7 +369,7 @@ public class RecalDataManager { /** * Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero * @param read The SAMRecord to recalibrate - * @param readBases The bases in the read RC'd if necessary + * @param readBases The bases in the read which have been RC'd if necessary * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color * @param originalQualScores The array of original quality scores to set to zero if needed * @param refBases The reference which has been RC'd if necessary @@ -360,7 +380,7 @@ public class RecalDataManager { private static byte[] solidRecalSetToQZero( SAMRecord read, byte[] readBases, int[] inconsistency, byte[] originalQualScores, final char[] refBases, final boolean isMappedToRef, final boolean setBaseN ) { - for( int iii = 1; iii < originalQualScores.length - 1; iii++ ) { // BUGBUG: This relies on the fact that in SOLID bams the first and last base are set to Q0 and aren't recalibrated anyway + for( int iii = 1; iii < originalQualScores.length - 1; iii++ ) { if( inconsistency[iii] == 1 ) { if( !isMappedToRef || (char)readBases[iii] == refBases[iii] ) { originalQualScores[iii] = (byte)0; @@ -387,7 +407,7 @@ public class RecalDataManager { /** * Peform the REMOVE_REF_BIAS solid recalibration. Look at the color space qualities and probabilistically decide if the base should be change to match the color or left as reference * @param read The SAMRecord to recalibrate - * @param readBases The bases in the read RC'd if necessary + * @param readBases The bases in the read which have been RC'd if necessary * @param inconsistency The array of 1/0 that says if this base is inconsistent with its color * @param colorImpliedBases The bases implied by the color space, RC'd if necessary * @param refBases The reference which has been RC'd if necessary @@ -396,8 +416,15 @@ public class RecalDataManager { */ private static void solidRecalRemoveRefBias( SAMRecord read, byte[] readBases, int[] inconsistency, byte[] colorImpliedBases, final char[] refBases, final boolean isMappedToRef, final Random coinFlip ) { - if( read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG) != null ) { - byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG)); + + Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG); + if( attr != null ) { + byte[] colorSpaceQuals; + if( attr instanceof String ) { + colorSpaceQuals = QualityUtils.fastqToPhred((String)attr); + } else { + throw new StingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName())); + } for( int iii = 1; iii < inconsistency.length - 2; iii++ ) { if( inconsistency[iii] == 1 ) { @@ -413,8 +440,8 @@ public class RecalDataManager { int minQuality = Math.min((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]); int diffInQuality = maxQuality - minQuality; int numLow = minQuality; - if(numLow == 0) { numLow = 1; } - int numHigh = Math.round(numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely + if( numLow == 0 ) { numLow = 1; } + int numHigh = Math.round( numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely int rand = coinFlip.nextInt( numLow + numHigh ); if( rand >= numLow ) { if( maxQuality == (int)colorSpaceQuals[jjj] ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index b61e04a7a..8f662a9dc 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -94,7 +94,7 @@ public class TableRecalibrationWalker extends ReadWalker e = new HashMap(); e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "c1b54d4221fb4fa88e0231a74310708e" ); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "95e26e8247d0c5e43705048c5ee64873"); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "337ea30c4dcc2fe6a9adc442ffd0706b"); e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "ffbfd38b1720cfb67ba1bb63d4308552" ); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "427306f07f5fd905439b28a770f3d3d6" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "60e227ea8c3409fa85b92cae7ea6574f" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -49,10 +49,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibrator1() { HashMap e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "ca839a4eb0ef443e0486b96843304f92" ); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "fd27c3ab424ef01c77d2dbdedf721a8a"); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b6e16538bda18336f176417cdf686f6a" ); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "be9a46e0d9b7ad90ce303d08dfb7b4de" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "f7749792ffffbb86aec66e92a3bddf7f" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "f1780e3c3e12f07527e0468149312f10"); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "c54a67a1687a4139a8ae19762217987f" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "d9ddbacdafc621d830a1db637973d795" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -80,7 +80,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesVCF() { HashMap e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "5e132283b906f6de3328986fd0101be7"); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "3ee0e811682c0f29951128204765ece9"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -108,7 +108,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesNoReadGroups() { HashMap e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "199979b483f6c03c0977141f4fea9961" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "f49bc79225bffbf8b64590b65a4b4305" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -138,7 +138,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorNoReadGroups() { HashMap e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "2889c63854e233fadd9178ccf5b2517b" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "62413fdbfe99cd6e24992de4234de5bc" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey();