Bug fixes for both SET_Q_ZERO and REMOVE_REF_BIAS solid recal modes regarding proper handling of negative strand reads. These changes yield a minor improvment in HapMap sensitivity.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2548 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
dfcd5ce25b
commit
9cbae53ee1
|
|
@ -103,7 +103,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
|
||||
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted.
|
||||
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
|
||||
private static final String versionString = "v2.2.2"; // Major version, minor version, and build number
|
||||
private static final String versionString = "v2.2.3"; // Major version, minor version, and build number
|
||||
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
|
||||
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
|
||||
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
||||
|
|
@ -311,7 +311,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
* @param context The AlignmentContext which holds the reads covered by this locus
|
||||
* @param ref The reference base
|
||||
*/
|
||||
private static void updateMismatchCounts(Pair<Long, Long> counts, AlignmentContext context, char ref) {
|
||||
private static void updateMismatchCounts(final Pair<Long, Long> counts, final AlignmentContext context, final char ref) {
|
||||
for( PileupElement p : context.getPileup() ) {
|
||||
final char readChar = (char)(p.getBase());
|
||||
final int readCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(readChar);
|
||||
|
|
@ -377,7 +377,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
final long curMismatches = datum.getNumMismatches();
|
||||
|
||||
// Add one to the number of observations and potentially one to the number of mismatches
|
||||
datum.increment( (char)base, (char)refBase ); // Dangerous: If you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad
|
||||
datum.increment( (char)base, (char)refBase ); // Dangerous: If you don't cast to char then the bytes default to the (long, long) version of the increment method which is really bad
|
||||
countedBases++;
|
||||
novel_counts.second++;
|
||||
novel_counts.first += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
|
||||
|
|
|
|||
|
|
@ -260,11 +260,11 @@ public class RecalDataManager {
|
|||
if( read.getReadNegativeStrandFlag() ) {
|
||||
readBases = BaseUtils.simpleReverseComplement( read.getReadBases() );
|
||||
}
|
||||
byte[] inconsistency = new byte[readBases.length];
|
||||
final byte[] inconsistency = new byte[readBases.length];
|
||||
int iii;
|
||||
byte prevBase = (byte) colorSpace[0]; // The sentinel
|
||||
for( iii = 0; iii < readBases.length; iii++ ) {
|
||||
byte thisBase = (byte)getNextBaseFromColor( (char)prevBase, colorSpace[iii + 1] );
|
||||
final byte thisBase = (byte)getNextBaseFromColor( (char)prevBase, colorSpace[iii + 1] );
|
||||
inconsistency[iii] = (byte)( thisBase == readBases[iii] ? 0 : 1 );
|
||||
prevBase = readBases[iii];
|
||||
}
|
||||
|
|
@ -289,9 +289,9 @@ public class RecalDataManager {
|
|||
* @param refBases The reference for this read
|
||||
* @return A new array of quality scores that have been ref bias corrected
|
||||
*/
|
||||
public static byte[] calcColorSpace( SAMRecord read, byte[] originalQualScores, final String SOLID_RECAL_MODE, final Random coinFlip, final char[] refBases ) {
|
||||
public static byte[] calcColorSpace( final SAMRecord read, byte[] originalQualScores, final String SOLID_RECAL_MODE, final Random coinFlip, final char[] refBases ) {
|
||||
|
||||
Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
||||
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
||||
if( attr != null ) {
|
||||
char[] colorSpace;
|
||||
if( attr instanceof String ) {
|
||||
|
|
@ -308,16 +308,16 @@ public class RecalDataManager {
|
|||
readBases = BaseUtils.simpleReverseComplement( read.getReadBases() );
|
||||
refBasesDirRead = BaseUtils.simpleReverseComplement( refBases );
|
||||
}
|
||||
int[] inconsistency = new int[readBases.length];
|
||||
final int[] inconsistency = new int[readBases.length];
|
||||
byte prevBase = (byte) colorSpace[0]; // The sentinel
|
||||
for( int iii = 0; iii < readBases.length; iii++ ) {
|
||||
byte thisBase = (byte)getNextBaseFromColor( (char)prevBase, colorSpace[iii + 1] );
|
||||
final byte thisBase = (byte)getNextBaseFromColor( (char)prevBase, colorSpace[iii + 1] );
|
||||
colorImpliedBases[iii] = thisBase;
|
||||
inconsistency[iii] = ( thisBase == readBases[iii] ? 0 : 1 );
|
||||
prevBase = readBases[iii];
|
||||
}
|
||||
|
||||
boolean isMappedToReference = (refBases != null && refBases.length == inconsistency.length);
|
||||
final boolean isMappedToReference = (refBases != null && refBases.length == inconsistency.length);
|
||||
|
||||
// Now that we have the inconsistency array apply the desired correction to the inconsistent bases
|
||||
if( SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") ) { // Set inconsistent bases and the one before it to Q0
|
||||
|
|
@ -328,7 +328,7 @@ public class RecalDataManager {
|
|||
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, isMappedToReference, setBaseN);
|
||||
} else if( SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") ) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases
|
||||
solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead, isMappedToReference, coinFlip);
|
||||
}
|
||||
}
|
||||
|
||||
} else if ( !warnUserNoColorSpace ) { // Warn the user if we can't find the color space tag
|
||||
Utils.warnUser("Unable to find color space information in SOLID read. First observed at read with name = " + read.getReadName());
|
||||
|
|
@ -350,27 +350,26 @@ public class RecalDataManager {
|
|||
* @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar
|
||||
* @return The byte array of original quality scores some of which might have been set to zero
|
||||
*/
|
||||
private static byte[] solidRecalSetToQZero( SAMRecord read, byte[] readBases, int[] inconsistency, byte[] originalQualScores,
|
||||
private static byte[] solidRecalSetToQZero( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores,
|
||||
final char[] refBases, final boolean isMappedToRef, final boolean setBaseN ) {
|
||||
|
||||
final boolean negStrand = read.getReadNegativeStrandFlag();
|
||||
for( int iii = 1; iii < originalQualScores.length - 1; iii++ ) {
|
||||
if( inconsistency[iii] == 1 ) {
|
||||
if( !isMappedToRef || (char)readBases[iii] == refBases[iii] ) {
|
||||
originalQualScores[iii] = (byte)0;
|
||||
if( negStrand ) { originalQualScores[originalQualScores.length-(iii+1)] = (byte)0; }
|
||||
else { originalQualScores[iii] = (byte)0; }
|
||||
if( setBaseN ) { readBases[iii] = (byte)'N'; }
|
||||
}
|
||||
// Set the prev base to Q0 as well
|
||||
if( !isMappedToRef || (char)readBases[iii-1] == refBases[iii-1] ) {
|
||||
originalQualScores[iii-1] = (byte)0;
|
||||
if( negStrand ) { originalQualScores[originalQualScores.length-iii] = (byte)0; }
|
||||
else { originalQualScores[iii-1] = (byte)0; }
|
||||
if( setBaseN ) { readBases[iii-1] = (byte)'N'; }
|
||||
}
|
||||
//if( !isMappedToRef || (char)readBases[iii+1] == refBases[iii+1] ) {
|
||||
// originalQualScores[iii+1] = (byte)0;
|
||||
// if( setBaseN ) { readBases[iii+1] = (byte)'N'; }
|
||||
//}
|
||||
}
|
||||
}
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
if( negStrand ) {
|
||||
readBases = BaseUtils.simpleReverseComplement( readBases.clone() ); // Put the bases back in reverse order to stuff them back in the read
|
||||
}
|
||||
read.setReadBases( readBases );
|
||||
|
|
@ -387,10 +386,10 @@ public class RecalDataManager {
|
|||
* @param isMappedToRef Is this read mapped fully to a reference
|
||||
* @param coinFlip A random number generator
|
||||
*/
|
||||
private static void solidRecalRemoveRefBias( SAMRecord read, byte[] readBases, int[] inconsistency, byte[] colorImpliedBases,
|
||||
final char[] refBases, final boolean isMappedToRef, final Random coinFlip ) {
|
||||
private static void solidRecalRemoveRefBias( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases,
|
||||
final char[] refBases, final boolean isMappedToRef, final Random coinFlip) {
|
||||
|
||||
Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
|
||||
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
|
||||
if( attr != null ) {
|
||||
byte[] colorSpaceQuals;
|
||||
if( attr instanceof String ) {
|
||||
|
|
@ -402,27 +401,32 @@ public class RecalDataManager {
|
|||
for( int iii = 1; iii < inconsistency.length - 1; iii++ ) {
|
||||
if( inconsistency[iii] == 1 ) {
|
||||
for( int jjj = iii - 1; jjj <= iii; jjj++ ) { // Correct this base and the one before it along the direction of the read
|
||||
if( !isMappedToRef || (char)readBases[jjj] == refBases[jjj] ) {
|
||||
if( colorSpaceQuals[jjj] == colorSpaceQuals[jjj+1] ) { // Equal evidence for the color implied base and the reference base, so flip a coin
|
||||
int rand = coinFlip.nextInt( 2 );
|
||||
if( rand == 0 ) { // The color implied base won the coin flip
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
}
|
||||
} else {
|
||||
int maxQuality = Math.max((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]);
|
||||
int minQuality = Math.min((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]);
|
||||
int diffInQuality = maxQuality - minQuality;
|
||||
int numLow = minQuality;
|
||||
if( numLow == 0 ) { numLow = 1; }
|
||||
int numHigh = Math.round( numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely
|
||||
int rand = coinFlip.nextInt( numLow + numHigh );
|
||||
if( rand >= numLow ) {
|
||||
if( maxQuality == (int)colorSpaceQuals[jjj] ) {
|
||||
if( jjj == iii || inconsistency[jjj] == 0 ) { // Don't want to correct the previous base a second time if it was already corrected in the previous step
|
||||
if( !isMappedToRef || (char)readBases[jjj] == refBases[jjj] ) {
|
||||
if( colorSpaceQuals[jjj] == colorSpaceQuals[jjj+1] ) { // Equal evidence for the color implied base and the reference base, so flip a coin
|
||||
final int rand = coinFlip.nextInt( 2 );
|
||||
if( rand == 0 ) { // The color implied base won the coin flip
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
}
|
||||
} else {
|
||||
if( minQuality == (int)colorSpaceQuals[jjj] ) {
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
final int maxQuality = Math.max((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]);
|
||||
final int minQuality = Math.min((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]);
|
||||
int diffInQuality = maxQuality - minQuality;
|
||||
int numLow = minQuality;
|
||||
if( numLow == 0 ) {
|
||||
numLow++;
|
||||
diffInQuality++;
|
||||
}
|
||||
final int numHigh = Math.round( numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely
|
||||
final int rand = coinFlip.nextInt( numLow + numHigh );
|
||||
if( rand >= numLow ) { // higher q score won
|
||||
if( maxQuality == (int)colorSpaceQuals[jjj] ) {
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
} // else ref color had higher q score, and won out, so nothing to do here
|
||||
} else { // lower q score won
|
||||
if( minQuality == (int)colorSpaceQuals[jjj] ) {
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
} // else ref color had lower q score, and won out, so nothing to do here
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -457,7 +461,7 @@ public class RecalDataManager {
|
|||
case '3': // simple complement
|
||||
return BaseUtils.simpleComplement( prevBase );
|
||||
default:
|
||||
throw new StingException( "Unrecognized color space in SOLID bam, color = " + color );
|
||||
throw new StingException( "Unrecognized color space in SOLID read, color = " + color );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -468,9 +472,9 @@ public class RecalDataManager {
|
|||
* @return Returns true if the base is inconsistent with the color space
|
||||
*/
|
||||
public static boolean isInconsistentColorSpace( final SAMRecord read, final int offset ) {
|
||||
Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG);
|
||||
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG);
|
||||
if( attr != null ) {
|
||||
byte[] colorSpace = (byte[])attr;
|
||||
final byte[] colorSpace = (byte[])attr;
|
||||
return colorSpace[offset] != 0;
|
||||
} else {
|
||||
return false;
|
||||
|
|
|
|||
|
|
@ -94,7 +94,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
private static final String versionString = "v2.2.5"; // Major version, minor version, and build number
|
||||
private static final String versionString = "v2.2.6"; // Major version, minor version, and build number
|
||||
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
|
||||
private Random coinFlip; // Random number generator is used to remove reference bias in solid bams
|
||||
private static final long RANDOM_SEED = 1032861495;
|
||||
|
|
@ -175,7 +175,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
throw new StingException( "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} else { // Found a line of data
|
||||
|
|
@ -259,7 +258,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
|
||||
* @param line A line of CSV data read from the recalibration table data file
|
||||
*/
|
||||
private void addCSVData(String line) {
|
||||
private void addCSVData(final String line) {
|
||||
final String[] vals = line.split(",");
|
||||
|
||||
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
|
||||
|
|
@ -302,7 +301,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
byte[] originalQuals = read.getBaseQualities();
|
||||
final byte[] recalQuals = originalQuals.clone();
|
||||
|
||||
String platform = read.getReadGroup().getPlatform();
|
||||
final String platform = read.getReadGroup().getPlatform();
|
||||
if( platform.equalsIgnoreCase("SOLID") && !RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") ) {
|
||||
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, coinFlip, refBases );
|
||||
}
|
||||
|
|
@ -351,7 +350,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
* @param key The list of Comparables that were calculated from the covariates
|
||||
* @return A recalibrated quality score as a byte
|
||||
*/
|
||||
private byte performSequentialQualityCalculation(Object... key ) {
|
||||
private byte performSequentialQualityCalculation(final Object... key ) {
|
||||
|
||||
final byte qualFromRead = (byte)Integer.parseInt(key[1].toString());
|
||||
final Object[] readGroupCollapsedKey = new Object[1];
|
||||
|
|
@ -413,7 +412,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
* @param originalQuals The list of original base quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
*/
|
||||
private void preserveQScores( final byte[] originalQuals, byte[] recalQuals ) {
|
||||
private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) {
|
||||
for( int iii = 0; iii < recalQuals.length; iii++ ) {
|
||||
if( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) {
|
||||
recalQuals[iii] = originalQuals[iii];
|
||||
|
|
|
|||
|
|
@ -50,9 +50,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
public void testTableRecalibrator1() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "6c59d291c37d053e0f188b762f3060a5" );
|
||||
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "27b3eaf3c02ffc5fb3d7815468d9958e");
|
||||
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "0f4a882d78747380af2822317c7b6045");
|
||||
e.put( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "7ebdce416b72679e1cf88cc9886a5edc" );
|
||||
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "a39afc94ed74f8137c9d43285997bd90" );
|
||||
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "41b0313bfcaf5afb3cb85f9830e79769" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -80,7 +80,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testTableRecalibratorMaxQ70() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "bf8d62beda7d3cf38d660d7939901192" );
|
||||
e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "8120b648a43b293b84a7cb6374513fb8" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -165,7 +165,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testTableRecalibratorNoReadGroups() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "ad345fcfb2faaf97eb0291ffa61b3228" );
|
||||
e.put( validationDataLocation + "NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "d9d29f38f8ea31ac267f9d937cad9291" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
|
|||
Loading…
Reference in New Issue