The new recalibrator had been mimicking the behavior of the old one in that if there was no dinuc available (following a no-call base or at either end of a read), it didn't try to recalibrate. Now that Ryan has modularized the system, we no longer need to skip the base completely (we just need to skip the dinuc value)... which is good because the Picard people complained after realizing that cycle #1 never got recalibrated.
The major effects of this commit are as follows: 1. We no longer skip any good bases (of course, this change alone breaks every single integration test). 2. The dinuc covariate returns a "no dinuc" value for the first base of a read (but not for the last base anymore, since there is a valid dinuc) or if the previous base is a bad base (e.g. 'N'). I've done a bunch of testing on real data and everything looks right; however, let's wait until the recalibrator guru gets back from vacation next week and can double-check everything before shipping this out in another early access release. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2443 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
aaf674d9db
commit
438d21842a
|
|
@ -258,7 +258,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
SAMRecord read;
|
||||
int offset;
|
||||
byte refBase;
|
||||
byte prevBase;
|
||||
byte[] bases;
|
||||
|
||||
// For each read at this locus
|
||||
|
|
@ -269,39 +268,29 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
RecalDataManager.parseSAMRecord( read, RAC );
|
||||
RecalDataManager.parseColorSpace( read );
|
||||
|
||||
// Skip first and last base because there is no dinuc
|
||||
// BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests.
|
||||
if( offset > 0 && offset < read.getReadLength() - 1) {
|
||||
// Skip if base quality is zero
|
||||
if( read.getBaseQualities()[offset] > 0 ) {
|
||||
// Skip if base quality is zero
|
||||
if( read.getBaseQualities()[offset] > 0 ) {
|
||||
|
||||
bases = read.getReadBases();
|
||||
refBase = (byte)ref.getBase();
|
||||
prevBase = bases[offset - 1];
|
||||
bases = read.getReadBases();
|
||||
refBase = (byte)ref.getBase();
|
||||
|
||||
// DinucCovariate is responsible for getting the complement bases if needed
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
prevBase = bases[offset + 1];
|
||||
}
|
||||
// Skip if this base is an 'N' or etc.
|
||||
// BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read.
|
||||
if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {
|
||||
|
||||
// Skip if this base or the previous one was an 'N' or etc.
|
||||
// BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read.
|
||||
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {
|
||||
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
|
||||
if( !read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) {
|
||||
|
||||
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
|
||||
if( !read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) {
|
||||
// This base finally passed all the checks for a good base, so add it to the big data hashmap
|
||||
updateDataFromRead( read, offset, refBase );
|
||||
|
||||
// This base finally passed all the checks for a good base, so add it to the big data hashmap
|
||||
updateDataFromRead( read, offset, refBase );
|
||||
|
||||
} else { // calculate SOLID reference insertion rate
|
||||
if( ref.getBase() == (char)bases[offset] ) {
|
||||
solidInsertedReferenceBases++;
|
||||
} else {
|
||||
otherColorSpaceInconsistency++;
|
||||
}
|
||||
} else { // calculate SOLID reference insertion rate
|
||||
if( ref.getBase() == (char)bases[offset] ) {
|
||||
solidInsertedReferenceBases++;
|
||||
} else {
|
||||
otherColorSpaceInconsistency++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -380,7 +369,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
|
||||
// Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||
for( Covariate covariate : requestedCovariates ) {
|
||||
key.add( covariate.getValue( read, offset ) );
|
||||
key.add(covariate.getValue( read, offset ));
|
||||
}
|
||||
|
||||
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
||||
|
|
|
|||
|
|
@ -141,7 +141,6 @@ public class CycleCovariate implements StandardCovariate {
|
|||
return getValue( read, offset ); // a recursive call
|
||||
}
|
||||
|
||||
// TODO -- Ryan: sanity check me [EB]
|
||||
// differentiate between first and second of pair
|
||||
if ( read.getReadPairedFlag() && read.getSecondOfPairFlag() )
|
||||
cycle *= -1;
|
||||
|
|
|
|||
|
|
@ -42,6 +42,9 @@ import net.sf.samtools.SAMRecord;
|
|||
|
||||
public class DinucCovariate implements StandardCovariate {
|
||||
|
||||
private static final byte NO_CALL = (byte)'N';
|
||||
private static final Dinuc NO_DINUC = new Dinuc(NO_CALL, NO_CALL);
|
||||
|
||||
HashMap<Integer, Dinuc> dinucHashMap;
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
|
|
@ -53,6 +56,8 @@ public class DinucCovariate implements StandardCovariate {
|
|||
dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow
|
||||
}
|
||||
}
|
||||
// add the "no dinuc" entry too
|
||||
dinucHashMap.put( Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC);
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
|
|
@ -63,12 +68,23 @@ public class DinucCovariate implements StandardCovariate {
|
|||
byte[] bases = read.getReadBases();
|
||||
// If this is a negative strand read then we need to reverse the direction for our previous base
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
// no dinuc at the beginning of the read
|
||||
if( offset == bases.length-1 )
|
||||
return NO_DINUC;
|
||||
base = (byte)BaseUtils.simpleComplement( (char)(bases[offset]) );
|
||||
prevBase = (byte)BaseUtils.simpleComplement( (char)(bases[offset + 1]) );
|
||||
} else {
|
||||
// no dinuc at the beginning of the read
|
||||
if( offset == 0 )
|
||||
return NO_DINUC;
|
||||
base = bases[offset];
|
||||
prevBase = bases[offset - 1];
|
||||
}
|
||||
|
||||
// make sure the previous base is good
|
||||
if( !BaseUtils.isRegularBase(prevBase) )
|
||||
return NO_DINUC;
|
||||
|
||||
return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) );
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -468,8 +468,9 @@ public class RecalDataManager {
|
|||
* @return Returns true if the base is inconsistent with the color space
|
||||
*/
|
||||
public static boolean isInconsistentColorSpace( final SAMRecord read, final int offset ) {
|
||||
if( read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) != null ) {
|
||||
byte[] colorSpace = ((byte[])read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG));
|
||||
Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG);
|
||||
if( attr != null ) {
|
||||
byte[] colorSpace = (byte[])attr;
|
||||
return colorSpace[offset] != 0;
|
||||
} else {
|
||||
return false;
|
||||
|
|
|
|||
|
|
@ -310,17 +310,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
if( platform.equalsIgnoreCase("SOLID") && !RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") ) {
|
||||
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, coinFlip, refBases );
|
||||
}
|
||||
|
||||
int startPos = 1;
|
||||
int stopPos = read.getReadLength();
|
||||
|
||||
if( read.getReadNegativeStrandFlag() ) { // DinucCovariate is responsible for getting the complement base if needed
|
||||
startPos = 0;
|
||||
stopPos = read.getReadLength() - 1;
|
||||
}
|
||||
|
||||
// For each base in the read
|
||||
for( int iii = startPos; iii < stopPos; iii++ ) { // Skip first or last base because there is no dinuc depending on the direction of the read
|
||||
int readLength = read.getReadLength();
|
||||
for( int iii = 0; iii < readLength; iii++ ) {
|
||||
|
||||
// Get the covariate values which make up the key
|
||||
for( Covariate covariate : requestedCovariates ) {
|
||||
|
|
|
|||
|
|
@ -16,10 +16,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testCountCovariates1() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "9eddef42ae847f2edd966d3a5b463e50" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "22ee532c7e521d8cbfabedb07673429d");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "8b91c48f186c1e44723ac4e047ac553c" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "f586fcd8642deae6504d39430b20b713" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "c1b54d4221fb4fa88e0231a74310708e" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "95e26e8247d0c5e43705048c5ee64873");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "ffbfd38b1720cfb67ba1bb63d4308552" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "427306f07f5fd905439b28a770f3d3d6" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -49,10 +49,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testTableRecalibrator1() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "a3d4153765625a451bee768ad46775b3" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "5954b7246d2e0c5c65267026819d5933");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "91fd3c557db56fd969523d8c06d71a0b" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "9655f57949b01d6ee8a7c9b92e61a47f" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "ca839a4eb0ef443e0486b96843304f92" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "fd27c3ab424ef01c77d2dbdedf721a8a");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b6e16538bda18336f176417cdf686f6a" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "be9a46e0d9b7ad90ce303d08dfb7b4de" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -80,7 +80,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testCountCovariatesVCF() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "79e6738031f5a6f993838b33d70716c3");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "5e132283b906f6de3328986fd0101be7");
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -108,7 +108,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testCountCovariatesNoReadGroups() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "d6f3ea0cf2129ca5488977f087f1aa25" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "199979b483f6c03c0977141f4fea9961" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -138,7 +138,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testTableRecalibratorNoReadGroups() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "c0d5fdd6e07395ae2d7b08431995b30d" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "2889c63854e233fadd9178ccf5b2517b" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -166,7 +166,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testCountCovariatesNoIndex() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "f5a0adce5458ab661029fa5056a55551" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "906e5a08401722cc9a5528d2fd20ea6a" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -193,7 +193,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testTableRecalibratorNoIndex() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "33cb41fa3e959ac15ac9af329167a736" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "aa38b04c6b58badabb6b09d590284a2a" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
|
|||
Loading…
Reference in New Issue