Bug fixes for both SET_Q_ZERO and REMOVE_REF_BIAS solid recal modes regarding proper handling of negative strand reads. These changes yield a minor improvment in HapMap sensitivity.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2548 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-01-08 15:19:22 +00:00
parent dfcd5ce25b
commit 9cbae53ee1
4 changed files with 56 additions and 53 deletions

View File

@ -103,7 +103,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted.
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
private static final String versionString = "v2.2.2"; // Major version, minor version, and build number
private static final String versionString = "v2.2.3"; // Major version, minor version, and build number
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
@ -311,7 +311,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
* @param context The AlignmentContext which holds the reads covered by this locus
* @param ref The reference base
*/
private static void updateMismatchCounts(Pair<Long, Long> counts, AlignmentContext context, char ref) {
private static void updateMismatchCounts(final Pair<Long, Long> counts, final AlignmentContext context, final char ref) {
for( PileupElement p : context.getPileup() ) {
final char readChar = (char)(p.getBase());
final int readCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(readChar);
@ -377,7 +377,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
final long curMismatches = datum.getNumMismatches();
// Add one to the number of observations and potentially one to the number of mismatches
datum.increment( (char)base, (char)refBase ); // Dangerous: If you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad
datum.increment( (char)base, (char)refBase ); // Dangerous: If you don't cast to char then the bytes default to the (long, long) version of the increment method which is really bad
countedBases++;
novel_counts.second++;
novel_counts.first += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable

View File

@ -260,11 +260,11 @@ public class RecalDataManager {
if( read.getReadNegativeStrandFlag() ) {
readBases = BaseUtils.simpleReverseComplement( read.getReadBases() );
}
byte[] inconsistency = new byte[readBases.length];
final byte[] inconsistency = new byte[readBases.length];
int iii;
byte prevBase = (byte) colorSpace[0]; // The sentinel
for( iii = 0; iii < readBases.length; iii++ ) {
byte thisBase = (byte)getNextBaseFromColor( (char)prevBase, colorSpace[iii + 1] );
final byte thisBase = (byte)getNextBaseFromColor( (char)prevBase, colorSpace[iii + 1] );
inconsistency[iii] = (byte)( thisBase == readBases[iii] ? 0 : 1 );
prevBase = readBases[iii];
}
@ -289,9 +289,9 @@ public class RecalDataManager {
* @param refBases The reference for this read
* @return A new array of quality scores that have been ref bias corrected
*/
public static byte[] calcColorSpace( SAMRecord read, byte[] originalQualScores, final String SOLID_RECAL_MODE, final Random coinFlip, final char[] refBases ) {
public static byte[] calcColorSpace( final SAMRecord read, byte[] originalQualScores, final String SOLID_RECAL_MODE, final Random coinFlip, final char[] refBases ) {
Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
if( attr != null ) {
char[] colorSpace;
if( attr instanceof String ) {
@ -308,16 +308,16 @@ public class RecalDataManager {
readBases = BaseUtils.simpleReverseComplement( read.getReadBases() );
refBasesDirRead = BaseUtils.simpleReverseComplement( refBases );
}
int[] inconsistency = new int[readBases.length];
final int[] inconsistency = new int[readBases.length];
byte prevBase = (byte) colorSpace[0]; // The sentinel
for( int iii = 0; iii < readBases.length; iii++ ) {
byte thisBase = (byte)getNextBaseFromColor( (char)prevBase, colorSpace[iii + 1] );
final byte thisBase = (byte)getNextBaseFromColor( (char)prevBase, colorSpace[iii + 1] );
colorImpliedBases[iii] = thisBase;
inconsistency[iii] = ( thisBase == readBases[iii] ? 0 : 1 );
prevBase = readBases[iii];
}
boolean isMappedToReference = (refBases != null && refBases.length == inconsistency.length);
final boolean isMappedToReference = (refBases != null && refBases.length == inconsistency.length);
// Now that we have the inconsistency array apply the desired correction to the inconsistent bases
if( SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") ) { // Set inconsistent bases and the one before it to Q0
@ -328,7 +328,7 @@ public class RecalDataManager {
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, isMappedToReference, setBaseN);
} else if( SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") ) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases
solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead, isMappedToReference, coinFlip);
}
}
} else if ( !warnUserNoColorSpace ) { // Warn the user if we can't find the color space tag
Utils.warnUser("Unable to find color space information in SOLID read. First observed at read with name = " + read.getReadName());
@ -350,27 +350,26 @@ public class RecalDataManager {
* @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar
* @return The byte array of original quality scores some of which might have been set to zero
*/
private static byte[] solidRecalSetToQZero( SAMRecord read, byte[] readBases, int[] inconsistency, byte[] originalQualScores,
private static byte[] solidRecalSetToQZero( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores,
final char[] refBases, final boolean isMappedToRef, final boolean setBaseN ) {
final boolean negStrand = read.getReadNegativeStrandFlag();
for( int iii = 1; iii < originalQualScores.length - 1; iii++ ) {
if( inconsistency[iii] == 1 ) {
if( !isMappedToRef || (char)readBases[iii] == refBases[iii] ) {
originalQualScores[iii] = (byte)0;
if( negStrand ) { originalQualScores[originalQualScores.length-(iii+1)] = (byte)0; }
else { originalQualScores[iii] = (byte)0; }
if( setBaseN ) { readBases[iii] = (byte)'N'; }
}
// Set the prev base to Q0 as well
if( !isMappedToRef || (char)readBases[iii-1] == refBases[iii-1] ) {
originalQualScores[iii-1] = (byte)0;
if( negStrand ) { originalQualScores[originalQualScores.length-iii] = (byte)0; }
else { originalQualScores[iii-1] = (byte)0; }
if( setBaseN ) { readBases[iii-1] = (byte)'N'; }
}
//if( !isMappedToRef || (char)readBases[iii+1] == refBases[iii+1] ) {
// originalQualScores[iii+1] = (byte)0;
// if( setBaseN ) { readBases[iii+1] = (byte)'N'; }
//}
}
}
if( read.getReadNegativeStrandFlag() ) {
if( negStrand ) {
readBases = BaseUtils.simpleReverseComplement( readBases.clone() ); // Put the bases back in reverse order to stuff them back in the read
}
read.setReadBases( readBases );
@ -387,10 +386,10 @@ public class RecalDataManager {
* @param isMappedToRef Is this read mapped fully to a reference
* @param coinFlip A random number generator
*/
private static void solidRecalRemoveRefBias( SAMRecord read, byte[] readBases, int[] inconsistency, byte[] colorImpliedBases,
final char[] refBases, final boolean isMappedToRef, final Random coinFlip ) {
private static void solidRecalRemoveRefBias( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases,
final char[] refBases, final boolean isMappedToRef, final Random coinFlip) {
Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
if( attr != null ) {
byte[] colorSpaceQuals;
if( attr instanceof String ) {
@ -402,27 +401,32 @@ public class RecalDataManager {
for( int iii = 1; iii < inconsistency.length - 1; iii++ ) {
if( inconsistency[iii] == 1 ) {
for( int jjj = iii - 1; jjj <= iii; jjj++ ) { // Correct this base and the one before it along the direction of the read
if( !isMappedToRef || (char)readBases[jjj] == refBases[jjj] ) {
if( colorSpaceQuals[jjj] == colorSpaceQuals[jjj+1] ) { // Equal evidence for the color implied base and the reference base, so flip a coin
int rand = coinFlip.nextInt( 2 );
if( rand == 0 ) { // The color implied base won the coin flip
readBases[jjj] = colorImpliedBases[jjj];
}
} else {
int maxQuality = Math.max((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]);
int minQuality = Math.min((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]);
int diffInQuality = maxQuality - minQuality;
int numLow = minQuality;
if( numLow == 0 ) { numLow = 1; }
int numHigh = Math.round( numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely
int rand = coinFlip.nextInt( numLow + numHigh );
if( rand >= numLow ) {
if( maxQuality == (int)colorSpaceQuals[jjj] ) {
if( jjj == iii || inconsistency[jjj] == 0 ) { // Don't want to correct the previous base a second time if it was already corrected in the previous step
if( !isMappedToRef || (char)readBases[jjj] == refBases[jjj] ) {
if( colorSpaceQuals[jjj] == colorSpaceQuals[jjj+1] ) { // Equal evidence for the color implied base and the reference base, so flip a coin
final int rand = coinFlip.nextInt( 2 );
if( rand == 0 ) { // The color implied base won the coin flip
readBases[jjj] = colorImpliedBases[jjj];
}
} else {
if( minQuality == (int)colorSpaceQuals[jjj] ) {
readBases[jjj] = colorImpliedBases[jjj];
final int maxQuality = Math.max((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]);
final int minQuality = Math.min((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]);
int diffInQuality = maxQuality - minQuality;
int numLow = minQuality;
if( numLow == 0 ) {
numLow++;
diffInQuality++;
}
final int numHigh = Math.round( numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely
final int rand = coinFlip.nextInt( numLow + numHigh );
if( rand >= numLow ) { // higher q score won
if( maxQuality == (int)colorSpaceQuals[jjj] ) {
readBases[jjj] = colorImpliedBases[jjj];
} // else ref color had higher q score, and won out, so nothing to do here
} else { // lower q score won
if( minQuality == (int)colorSpaceQuals[jjj] ) {
readBases[jjj] = colorImpliedBases[jjj];
} // else ref color had lower q score, and won out, so nothing to do here
}
}
}
@ -457,7 +461,7 @@ public class RecalDataManager {
case '3': // simple complement
return BaseUtils.simpleComplement( prevBase );
default:
throw new StingException( "Unrecognized color space in SOLID bam, color = " + color );
throw new StingException( "Unrecognized color space in SOLID read, color = " + color );
}
}
@ -468,9 +472,9 @@ public class RecalDataManager {
* @return Returns true if the base is inconsistent with the color space
*/
public static boolean isInconsistentColorSpace( final SAMRecord read, final int offset ) {
Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG);
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG);
if( attr != null ) {
byte[] colorSpace = (byte[])attr;
final byte[] colorSpace = (byte[])attr;
return colorSpace[offset] != 0;
} else {
return false;

View File

@ -94,7 +94,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
private static final String versionString = "v2.2.5"; // Major version, minor version, and build number
private static final String versionString = "v2.2.6"; // Major version, minor version, and build number
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
private Random coinFlip; // Random number generator is used to remove reference bias in solid bams
private static final long RANDOM_SEED = 1032861495;
@ -175,7 +175,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
throw new StingException( "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
}
}
}
} else { // Found a line of data
@ -259,7 +258,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
* @param line A line of CSV data read from the recalibration table data file
*/
private void addCSVData(String line) {
private void addCSVData(final String line) {
final String[] vals = line.split(",");
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
@ -302,7 +301,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
byte[] originalQuals = read.getBaseQualities();
final byte[] recalQuals = originalQuals.clone();
String platform = read.getReadGroup().getPlatform();
final String platform = read.getReadGroup().getPlatform();
if( platform.equalsIgnoreCase("SOLID") && !RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") ) {
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, coinFlip, refBases );
}
@ -351,7 +350,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* @param key The list of Comparables that were calculated from the covariates
* @return A recalibrated quality score as a byte
*/
private byte performSequentialQualityCalculation(Object... key ) {
private byte performSequentialQualityCalculation(final Object... key ) {
final byte qualFromRead = (byte)Integer.parseInt(key[1].toString());
final Object[] readGroupCollapsedKey = new Object[1];
@ -413,7 +412,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* @param originalQuals The list of original base quality scores
* @param recalQuals A list of the new recalibrated quality scores
*/
private void preserveQScores( final byte[] originalQuals, byte[] recalQuals ) {
private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) {
for( int iii = 0; iii < recalQuals.length; iii++ ) {
if( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) {
recalQuals[iii] = originalQuals[iii];

View File

@ -50,9 +50,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
public void testTableRecalibrator1() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "6c59d291c37d053e0f188b762f3060a5" );
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "27b3eaf3c02ffc5fb3d7815468d9958e");
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "0f4a882d78747380af2822317c7b6045");
e.put( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "7ebdce416b72679e1cf88cc9886a5edc" );
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "a39afc94ed74f8137c9d43285997bd90" );
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "41b0313bfcaf5afb3cb85f9830e79769" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();
@ -80,7 +80,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testTableRecalibratorMaxQ70() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "bf8d62beda7d3cf38d660d7939901192" );
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "8120b648a43b293b84a7cb6374513fb8" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();
@ -165,7 +165,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testTableRecalibratorNoReadGroups() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "ad345fcfb2faaf97eb0291ffa61b3228" );
e.put( validationDataLocation + "NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "d9d29f38f8ea31ac267f9d937cad9291" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();