The new recalibrator had been mimicking the behavior of the old one in that if there was no dinuc available (following a no-call base or at either end of a read), it didn't try to recalibrate. Now that Ryan has modularized the system, we no longer need to skip the base completely (we just need to skip the dinuc value)... which is good because the Picard people complained after realizing that cycle #1 never got recalibrated.
The major effects of this commit are as follows: 1. We no longer skip any good bases (of course, this change alone breaks every single integration test). 2. The dinuc covariate returns a "no dinuc" value for the first base of a read (but not for the last base anymore, since there is a valid dinuc) or if the previous base is a bad base (e.g. 'N'). I've done a bunch of testing on real data and everything looks right; however, let's wait until the recalibrator guru gets back from vacation next week and can double-check everything before shipping this out in another early access release. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2443 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
aaf674d9db
commit
438d21842a
|
|
@ -258,7 +258,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
SAMRecord read;
|
SAMRecord read;
|
||||||
int offset;
|
int offset;
|
||||||
byte refBase;
|
byte refBase;
|
||||||
byte prevBase;
|
|
||||||
byte[] bases;
|
byte[] bases;
|
||||||
|
|
||||||
// For each read at this locus
|
// For each read at this locus
|
||||||
|
|
@ -269,39 +268,29 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
RecalDataManager.parseSAMRecord( read, RAC );
|
RecalDataManager.parseSAMRecord( read, RAC );
|
||||||
RecalDataManager.parseColorSpace( read );
|
RecalDataManager.parseColorSpace( read );
|
||||||
|
|
||||||
// Skip first and last base because there is no dinuc
|
// Skip if base quality is zero
|
||||||
// BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests.
|
if( read.getBaseQualities()[offset] > 0 ) {
|
||||||
if( offset > 0 && offset < read.getReadLength() - 1) {
|
|
||||||
// Skip if base quality is zero
|
|
||||||
if( read.getBaseQualities()[offset] > 0 ) {
|
|
||||||
|
|
||||||
bases = read.getReadBases();
|
bases = read.getReadBases();
|
||||||
refBase = (byte)ref.getBase();
|
refBase = (byte)ref.getBase();
|
||||||
prevBase = bases[offset - 1];
|
|
||||||
|
|
||||||
// DinucCovariate is responsible for getting the complement bases if needed
|
// Skip if this base is an 'N' or etc.
|
||||||
if( read.getReadNegativeStrandFlag() ) {
|
// BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read.
|
||||||
prevBase = bases[offset + 1];
|
if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {
|
||||||
}
|
|
||||||
|
|
||||||
// Skip if this base or the previous one was an 'N' or etc.
|
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
|
||||||
// BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read.
|
if( !read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) {
|
||||||
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {
|
|
||||||
|
|
||||||
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
|
// This base finally passed all the checks for a good base, so add it to the big data hashmap
|
||||||
if( !read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) {
|
updateDataFromRead( read, offset, refBase );
|
||||||
|
|
||||||
// This base finally passed all the checks for a good base, so add it to the big data hashmap
|
} else { // calculate SOLID reference insertion rate
|
||||||
updateDataFromRead( read, offset, refBase );
|
if( ref.getBase() == (char)bases[offset] ) {
|
||||||
|
solidInsertedReferenceBases++;
|
||||||
} else { // calculate SOLID reference insertion rate
|
} else {
|
||||||
if( ref.getBase() == (char)bases[offset] ) {
|
otherColorSpaceInconsistency++;
|
||||||
solidInsertedReferenceBases++;
|
|
||||||
} else {
|
|
||||||
otherColorSpaceInconsistency++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -380,7 +369,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
|
|
||||||
// Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
// Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||||
for( Covariate covariate : requestedCovariates ) {
|
for( Covariate covariate : requestedCovariates ) {
|
||||||
key.add( covariate.getValue( read, offset ) );
|
key.add(covariate.getValue( read, offset ));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
||||||
|
|
|
||||||
|
|
@ -141,7 +141,6 @@ public class CycleCovariate implements StandardCovariate {
|
||||||
return getValue( read, offset ); // a recursive call
|
return getValue( read, offset ); // a recursive call
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO -- Ryan: sanity check me [EB]
|
|
||||||
// differentiate between first and second of pair
|
// differentiate between first and second of pair
|
||||||
if ( read.getReadPairedFlag() && read.getSecondOfPairFlag() )
|
if ( read.getReadPairedFlag() && read.getSecondOfPairFlag() )
|
||||||
cycle *= -1;
|
cycle *= -1;
|
||||||
|
|
|
||||||
|
|
@ -42,6 +42,9 @@ import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
public class DinucCovariate implements StandardCovariate {
|
public class DinucCovariate implements StandardCovariate {
|
||||||
|
|
||||||
|
private static final byte NO_CALL = (byte)'N';
|
||||||
|
private static final Dinuc NO_DINUC = new Dinuc(NO_CALL, NO_CALL);
|
||||||
|
|
||||||
HashMap<Integer, Dinuc> dinucHashMap;
|
HashMap<Integer, Dinuc> dinucHashMap;
|
||||||
|
|
||||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||||
|
|
@ -53,6 +56,8 @@ public class DinucCovariate implements StandardCovariate {
|
||||||
dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow
|
dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// add the "no dinuc" entry too
|
||||||
|
dinucHashMap.put( Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Used to pick out the covariate's value from attributes of the read
|
// Used to pick out the covariate's value from attributes of the read
|
||||||
|
|
@ -63,12 +68,23 @@ public class DinucCovariate implements StandardCovariate {
|
||||||
byte[] bases = read.getReadBases();
|
byte[] bases = read.getReadBases();
|
||||||
// If this is a negative strand read then we need to reverse the direction for our previous base
|
// If this is a negative strand read then we need to reverse the direction for our previous base
|
||||||
if( read.getReadNegativeStrandFlag() ) {
|
if( read.getReadNegativeStrandFlag() ) {
|
||||||
|
// no dinuc at the beginning of the read
|
||||||
|
if( offset == bases.length-1 )
|
||||||
|
return NO_DINUC;
|
||||||
base = (byte)BaseUtils.simpleComplement( (char)(bases[offset]) );
|
base = (byte)BaseUtils.simpleComplement( (char)(bases[offset]) );
|
||||||
prevBase = (byte)BaseUtils.simpleComplement( (char)(bases[offset + 1]) );
|
prevBase = (byte)BaseUtils.simpleComplement( (char)(bases[offset + 1]) );
|
||||||
} else {
|
} else {
|
||||||
|
// no dinuc at the beginning of the read
|
||||||
|
if( offset == 0 )
|
||||||
|
return NO_DINUC;
|
||||||
base = bases[offset];
|
base = bases[offset];
|
||||||
prevBase = bases[offset - 1];
|
prevBase = bases[offset - 1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// make sure the previous base is good
|
||||||
|
if( !BaseUtils.isRegularBase(prevBase) )
|
||||||
|
return NO_DINUC;
|
||||||
|
|
||||||
return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) );
|
return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -468,8 +468,9 @@ public class RecalDataManager {
|
||||||
* @return Returns true if the base is inconsistent with the color space
|
* @return Returns true if the base is inconsistent with the color space
|
||||||
*/
|
*/
|
||||||
public static boolean isInconsistentColorSpace( final SAMRecord read, final int offset ) {
|
public static boolean isInconsistentColorSpace( final SAMRecord read, final int offset ) {
|
||||||
if( read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) != null ) {
|
Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG);
|
||||||
byte[] colorSpace = ((byte[])read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG));
|
if( attr != null ) {
|
||||||
|
byte[] colorSpace = (byte[])attr;
|
||||||
return colorSpace[offset] != 0;
|
return colorSpace[offset] != 0;
|
||||||
} else {
|
} else {
|
||||||
return false;
|
return false;
|
||||||
|
|
|
||||||
|
|
@ -310,17 +310,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
if( platform.equalsIgnoreCase("SOLID") && !RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") ) {
|
if( platform.equalsIgnoreCase("SOLID") && !RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") ) {
|
||||||
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, coinFlip, refBases );
|
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, coinFlip, refBases );
|
||||||
}
|
}
|
||||||
|
|
||||||
int startPos = 1;
|
|
||||||
int stopPos = read.getReadLength();
|
|
||||||
|
|
||||||
if( read.getReadNegativeStrandFlag() ) { // DinucCovariate is responsible for getting the complement base if needed
|
|
||||||
startPos = 0;
|
|
||||||
stopPos = read.getReadLength() - 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// For each base in the read
|
// For each base in the read
|
||||||
for( int iii = startPos; iii < stopPos; iii++ ) { // Skip first or last base because there is no dinuc depending on the direction of the read
|
int readLength = read.getReadLength();
|
||||||
|
for( int iii = 0; iii < readLength; iii++ ) {
|
||||||
|
|
||||||
// Get the covariate values which make up the key
|
// Get the covariate values which make up the key
|
||||||
for( Covariate covariate : requestedCovariates ) {
|
for( Covariate covariate : requestedCovariates ) {
|
||||||
|
|
|
||||||
|
|
@ -16,10 +16,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testCountCovariates1() {
|
public void testCountCovariates1() {
|
||||||
HashMap<String, String> e = new HashMap<String, String>();
|
HashMap<String, String> e = new HashMap<String, String>();
|
||||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "9eddef42ae847f2edd966d3a5b463e50" );
|
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "c1b54d4221fb4fa88e0231a74310708e" );
|
||||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "22ee532c7e521d8cbfabedb07673429d");
|
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "95e26e8247d0c5e43705048c5ee64873");
|
||||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "8b91c48f186c1e44723ac4e047ac553c" );
|
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "ffbfd38b1720cfb67ba1bb63d4308552" );
|
||||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "f586fcd8642deae6504d39430b20b713" );
|
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "427306f07f5fd905439b28a770f3d3d6" );
|
||||||
|
|
||||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||||
String bam = entry.getKey();
|
String bam = entry.getKey();
|
||||||
|
|
@ -49,10 +49,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testTableRecalibrator1() {
|
public void testTableRecalibrator1() {
|
||||||
HashMap<String, String> e = new HashMap<String, String>();
|
HashMap<String, String> e = new HashMap<String, String>();
|
||||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "a3d4153765625a451bee768ad46775b3" );
|
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "ca839a4eb0ef443e0486b96843304f92" );
|
||||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "5954b7246d2e0c5c65267026819d5933");
|
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "fd27c3ab424ef01c77d2dbdedf721a8a");
|
||||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "91fd3c557db56fd969523d8c06d71a0b" );
|
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b6e16538bda18336f176417cdf686f6a" );
|
||||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "9655f57949b01d6ee8a7c9b92e61a47f" );
|
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "be9a46e0d9b7ad90ce303d08dfb7b4de" );
|
||||||
|
|
||||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||||
String bam = entry.getKey();
|
String bam = entry.getKey();
|
||||||
|
|
@ -80,7 +80,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testCountCovariatesVCF() {
|
public void testCountCovariatesVCF() {
|
||||||
HashMap<String, String> e = new HashMap<String, String>();
|
HashMap<String, String> e = new HashMap<String, String>();
|
||||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "79e6738031f5a6f993838b33d70716c3");
|
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "5e132283b906f6de3328986fd0101be7");
|
||||||
|
|
||||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||||
String bam = entry.getKey();
|
String bam = entry.getKey();
|
||||||
|
|
@ -108,7 +108,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testCountCovariatesNoReadGroups() {
|
public void testCountCovariatesNoReadGroups() {
|
||||||
HashMap<String, String> e = new HashMap<String, String>();
|
HashMap<String, String> e = new HashMap<String, String>();
|
||||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "d6f3ea0cf2129ca5488977f087f1aa25" );
|
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "199979b483f6c03c0977141f4fea9961" );
|
||||||
|
|
||||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||||
String bam = entry.getKey();
|
String bam = entry.getKey();
|
||||||
|
|
@ -138,7 +138,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testTableRecalibratorNoReadGroups() {
|
public void testTableRecalibratorNoReadGroups() {
|
||||||
HashMap<String, String> e = new HashMap<String, String>();
|
HashMap<String, String> e = new HashMap<String, String>();
|
||||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "c0d5fdd6e07395ae2d7b08431995b30d" );
|
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "2889c63854e233fadd9178ccf5b2517b" );
|
||||||
|
|
||||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||||
String bam = entry.getKey();
|
String bam = entry.getKey();
|
||||||
|
|
@ -166,7 +166,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testCountCovariatesNoIndex() {
|
public void testCountCovariatesNoIndex() {
|
||||||
HashMap<String, String> e = new HashMap<String, String>();
|
HashMap<String, String> e = new HashMap<String, String>();
|
||||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "f5a0adce5458ab661029fa5056a55551" );
|
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "906e5a08401722cc9a5528d2fd20ea6a" );
|
||||||
|
|
||||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||||
String bam = entry.getKey();
|
String bam = entry.getKey();
|
||||||
|
|
@ -193,7 +193,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testTableRecalibratorNoIndex() {
|
public void testTableRecalibratorNoIndex() {
|
||||||
HashMap<String, String> e = new HashMap<String, String>();
|
HashMap<String, String> e = new HashMap<String, String>();
|
||||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "33cb41fa3e959ac15ac9af329167a736" );
|
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "aa38b04c6b58badabb6b09d590284a2a" );
|
||||||
|
|
||||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||||
String bam = entry.getKey();
|
String bam = entry.getKey();
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue