The new recalibrator had been mimicking the behavior of the old one in that if there was no dinuc available (following a no-call base or at either end of a read), it didn't try to recalibrate. Now that Ryan has modularized the system, we no longer need to skip the base completely (we just need to skip the dinuc value)... which is good because the Picard people complained after realizing that cycle #1 never got recalibrated.

The major effects of this commit are as follows:
1. We no longer skip any good bases (of course, this change alone breaks every single integration test).
2. The dinuc covariate returns a "no dinuc" value for the first base of a read (but not for the last base anymore, since there is a valid dinuc) or if the previous base is a bad base (e.g. 'N').

I've done a bunch of testing on real data and everything looks right; however, let's wait until the recalibrator guru gets back from vacation next week and can double-check everything before shipping this out in another early access release.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2443 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-12-24 20:41:29 +00:00
parent aaf674d9db
commit 438d21842a
6 changed files with 52 additions and 54 deletions

View File

@ -258,7 +258,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
SAMRecord read;
int offset;
byte refBase;
byte prevBase;
byte[] bases;
// For each read at this locus
@ -269,39 +268,29 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
RecalDataManager.parseSAMRecord( read, RAC );
RecalDataManager.parseColorSpace( read );
// Skip first and last base because there is no dinuc
// BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests.
if( offset > 0 && offset < read.getReadLength() - 1) {
// Skip if base quality is zero
if( read.getBaseQualities()[offset] > 0 ) {
// Skip if base quality is zero
if( read.getBaseQualities()[offset] > 0 ) {
bases = read.getReadBases();
refBase = (byte)ref.getBase();
prevBase = bases[offset - 1];
bases = read.getReadBases();
refBase = (byte)ref.getBase();
// DinucCovariate is responsible for getting the complement bases if needed
if( read.getReadNegativeStrandFlag() ) {
prevBase = bases[offset + 1];
}
// Skip if this base is an 'N' or etc.
// BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read.
if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {
// Skip if this base or the previous one was an 'N' or etc.
// BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read.
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
if( !read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) {
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
if( !read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) {
// This base finally passed all the checks for a good base, so add it to the big data hashmap
updateDataFromRead( read, offset, refBase );
// This base finally passed all the checks for a good base, so add it to the big data hashmap
updateDataFromRead( read, offset, refBase );
} else { // calculate SOLID reference insertion rate
if( ref.getBase() == (char)bases[offset] ) {
solidInsertedReferenceBases++;
} else {
otherColorSpaceInconsistency++;
}
} else { // calculate SOLID reference insertion rate
if( ref.getBase() == (char)bases[offset] ) {
solidInsertedReferenceBases++;
} else {
otherColorSpaceInconsistency++;
}
}
}
}
}
}
@ -380,7 +369,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
// Loop through the list of requested covariates and pick out the value from the read, offset, and reference
for( Covariate covariate : requestedCovariates ) {
key.add( covariate.getValue( read, offset ) );
key.add(covariate.getValue( read, offset ));
}
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap

View File

@ -141,7 +141,6 @@ public class CycleCovariate implements StandardCovariate {
return getValue( read, offset ); // a recursive call
}
// TODO -- Ryan: sanity check me [EB]
// differentiate between first and second of pair
if ( read.getReadPairedFlag() && read.getSecondOfPairFlag() )
cycle *= -1;

View File

@ -42,6 +42,9 @@ import net.sf.samtools.SAMRecord;
public class DinucCovariate implements StandardCovariate {
private static final byte NO_CALL = (byte)'N';
private static final Dinuc NO_DINUC = new Dinuc(NO_CALL, NO_CALL);
HashMap<Integer, Dinuc> dinucHashMap;
// Initialize any member variables using the command-line arguments passed to the walkers
@ -53,6 +56,8 @@ public class DinucCovariate implements StandardCovariate {
dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow
}
}
// add the "no dinuc" entry too
dinucHashMap.put( Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC);
}
// Used to pick out the covariate's value from attributes of the read
@ -63,12 +68,23 @@ public class DinucCovariate implements StandardCovariate {
byte[] bases = read.getReadBases();
// If this is a negative strand read then we need to reverse the direction for our previous base
if( read.getReadNegativeStrandFlag() ) {
// no dinuc at the beginning of the read
if( offset == bases.length-1 )
return NO_DINUC;
base = (byte)BaseUtils.simpleComplement( (char)(bases[offset]) );
prevBase = (byte)BaseUtils.simpleComplement( (char)(bases[offset + 1]) );
} else {
// no dinuc at the beginning of the read
if( offset == 0 )
return NO_DINUC;
base = bases[offset];
prevBase = bases[offset - 1];
}
// make sure the previous base is good
if( !BaseUtils.isRegularBase(prevBase) )
return NO_DINUC;
return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) );
}

View File

@ -468,8 +468,9 @@ public class RecalDataManager {
* @return Returns true if the base is inconsistent with the color space
*/
public static boolean isInconsistentColorSpace( final SAMRecord read, final int offset ) {
if( read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) != null ) {
byte[] colorSpace = ((byte[])read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG));
Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG);
if( attr != null ) {
byte[] colorSpace = (byte[])attr;
return colorSpace[offset] != 0;
} else {
return false;

View File

@ -310,17 +310,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
if( platform.equalsIgnoreCase("SOLID") && !RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") ) {
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, coinFlip, refBases );
}
int startPos = 1;
int stopPos = read.getReadLength();
if( read.getReadNegativeStrandFlag() ) { // DinucCovariate is responsible for getting the complement base if needed
startPos = 0;
stopPos = read.getReadLength() - 1;
}
// For each base in the read
for( int iii = startPos; iii < stopPos; iii++ ) { // Skip first or last base because there is no dinuc depending on the direction of the read
int readLength = read.getReadLength();
for( int iii = 0; iii < readLength; iii++ ) {
// Get the covariate values which make up the key
for( Covariate covariate : requestedCovariates ) {

View File

@ -16,10 +16,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testCountCovariates1() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "9eddef42ae847f2edd966d3a5b463e50" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "22ee532c7e521d8cbfabedb07673429d");
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "8b91c48f186c1e44723ac4e047ac553c" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "f586fcd8642deae6504d39430b20b713" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "c1b54d4221fb4fa88e0231a74310708e" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "95e26e8247d0c5e43705048c5ee64873");
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "ffbfd38b1720cfb67ba1bb63d4308552" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "427306f07f5fd905439b28a770f3d3d6" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();
@ -49,10 +49,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testTableRecalibrator1() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "a3d4153765625a451bee768ad46775b3" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "5954b7246d2e0c5c65267026819d5933");
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "91fd3c557db56fd969523d8c06d71a0b" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "9655f57949b01d6ee8a7c9b92e61a47f" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "ca839a4eb0ef443e0486b96843304f92" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "fd27c3ab424ef01c77d2dbdedf721a8a");
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b6e16538bda18336f176417cdf686f6a" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "be9a46e0d9b7ad90ce303d08dfb7b4de" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();
@ -80,7 +80,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testCountCovariatesVCF() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "79e6738031f5a6f993838b33d70716c3");
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "5e132283b906f6de3328986fd0101be7");
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();
@ -108,7 +108,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testCountCovariatesNoReadGroups() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "d6f3ea0cf2129ca5488977f087f1aa25" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "199979b483f6c03c0977141f4fea9961" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();
@ -138,7 +138,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testTableRecalibratorNoReadGroups() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "c0d5fdd6e07395ae2d7b08431995b30d" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "2889c63854e233fadd9178ccf5b2517b" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();
@ -166,7 +166,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testCountCovariatesNoIndex() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "f5a0adce5458ab661029fa5056a55551" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "906e5a08401722cc9a5528d2fd20ea6a" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();
@ -193,7 +193,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testTableRecalibratorNoIndex() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "33cb41fa3e959ac15ac9af329167a736" );
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "aa38b04c6b58badabb6b09d590284a2a" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String bam = entry.getKey();