From 438d21842a940782961c2ffd7bf84822954105c7 Mon Sep 17 00:00:00 2001 From: ebanks Date: Thu, 24 Dec 2009 20:41:29 +0000 Subject: [PATCH] The new recalibrator had been mimicking the behavior of the old one in that if there was no dinuc available (following a no-call base or at either end of a read), it didn't try to recalibrate. Now that Ryan has modularized the system, we no longer need to skip the base completely (we just need to skip the dinuc value)... which is good because the Picard people complained after realizing that cycle #1 never got recalibrated. The major effects of this commit are as follows: 1. We no longer skip any good bases (of course, this change alone breaks every single integration test). 2. The dinuc covariate returns a "no dinuc" value for the first base of a read (but not for the last base anymore, since there is a valid dinuc) or if the previous base is a bad base (e.g. 'N'). I've done a bunch of testing on real data and everything looks right; however, let's wait until the recalibrator guru gets back from vacation next week and can double-check everything before shipping this out in another early access release. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2443 348d0f76-0448-11de-a6fe-93d51630548a --- .../recalibration/CovariateCounterWalker.java | 47 +++++++------------ .../walkers/recalibration/CycleCovariate.java | 1 - .../walkers/recalibration/DinucCovariate.java | 16 +++++++ .../recalibration/RecalDataManager.java | 5 +- .../TableRecalibrationWalker.java | 11 +---- .../RecalibrationWalkersIntegrationTest.java | 26 +++++----- 6 files changed, 52 insertions(+), 54 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index 15186f166..1ce05679c 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -258,7 +258,6 @@ public class CovariateCounterWalker extends LocusWalker { SAMRecord read; int offset; byte refBase; - byte prevBase; byte[] bases; // For each read at this locus @@ -269,39 +268,29 @@ public class CovariateCounterWalker extends LocusWalker { RecalDataManager.parseSAMRecord( read, RAC ); RecalDataManager.parseColorSpace( read ); - // Skip first and last base because there is no dinuc - // BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests. - if( offset > 0 && offset < read.getReadLength() - 1) { - // Skip if base quality is zero - if( read.getBaseQualities()[offset] > 0 ) { + // Skip if base quality is zero + if( read.getBaseQualities()[offset] > 0 ) { - bases = read.getReadBases(); - refBase = (byte)ref.getBase(); - prevBase = bases[offset - 1]; + bases = read.getReadBases(); + refBase = (byte)ref.getBase(); - // DinucCovariate is responsible for getting the complement bases if needed - if( read.getReadNegativeStrandFlag() ) { - prevBase = bases[offset + 1]; - } + // Skip if this base is an 'N' or etc. + // BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read. + if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) { - // Skip if this base or the previous one was an 'N' or etc. - // BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read. - if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(bases[offset]) ) ) { + // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it + if( !read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) { - // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it - if( !read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) { + // This base finally passed all the checks for a good base, so add it to the big data hashmap + updateDataFromRead( read, offset, refBase ); - // This base finally passed all the checks for a good base, so add it to the big data hashmap - updateDataFromRead( read, offset, refBase ); - - } else { // calculate SOLID reference insertion rate - if( ref.getBase() == (char)bases[offset] ) { - solidInsertedReferenceBases++; - } else { - otherColorSpaceInconsistency++; - } + } else { // calculate SOLID reference insertion rate + if( ref.getBase() == (char)bases[offset] ) { + solidInsertedReferenceBases++; + } else { + otherColorSpaceInconsistency++; } - } + } } } } @@ -380,7 +369,7 @@ public class CovariateCounterWalker extends LocusWalker { // Loop through the list of requested covariates and pick out the value from the read, offset, and reference for( Covariate covariate : requestedCovariates ) { - key.add( covariate.getValue( read, offset ) ); + key.add(covariate.getValue( read, offset )); } // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index bc57f754f..fab76357d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -141,7 +141,6 @@ public class CycleCovariate implements StandardCovariate { return getValue( read, offset ); // a recursive call } - // TODO -- Ryan: sanity check me [EB] // differentiate between first and second of pair if ( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) cycle *= -1; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java index 1278e7292..a2a7d79d4 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java @@ -42,6 +42,9 @@ import net.sf.samtools.SAMRecord; public class DinucCovariate implements StandardCovariate { + private static final byte NO_CALL = (byte)'N'; + private static final Dinuc NO_DINUC = new Dinuc(NO_CALL, NO_CALL); + HashMap dinucHashMap; // Initialize any member variables using the command-line arguments passed to the walkers @@ -53,6 +56,8 @@ public class DinucCovariate implements StandardCovariate { dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow } } + // add the "no dinuc" entry too + dinucHashMap.put( Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC); } // Used to pick out the covariate's value from attributes of the read @@ -63,12 +68,23 @@ public class DinucCovariate implements StandardCovariate { byte[] bases = read.getReadBases(); // If this is a negative strand read then we need to reverse the direction for our previous base if( read.getReadNegativeStrandFlag() ) { + // no dinuc at the beginning of the read + if( offset == bases.length-1 ) + return NO_DINUC; base = (byte)BaseUtils.simpleComplement( (char)(bases[offset]) ); prevBase = (byte)BaseUtils.simpleComplement( (char)(bases[offset + 1]) ); } else { + // no dinuc at the beginning of the read + if( offset == 0 ) + return NO_DINUC; base = bases[offset]; prevBase = bases[offset - 1]; } + + // make sure the previous base is good + if( !BaseUtils.isRegularBase(prevBase) ) + return NO_DINUC; + return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) ); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index cd2c74273..c54e34375 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -468,8 +468,9 @@ public class RecalDataManager { * @return Returns true if the base is inconsistent with the color space */ public static boolean isInconsistentColorSpace( final SAMRecord read, final int offset ) { - if( read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) != null ) { - byte[] colorSpace = ((byte[])read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG)); + Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG); + if( attr != null ) { + byte[] colorSpace = (byte[])attr; return colorSpace[offset] != 0; } else { return false; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index bb3b79161..b61e04a7a 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -310,17 +310,10 @@ public class TableRecalibrationWalker extends ReadWalker e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "9eddef42ae847f2edd966d3a5b463e50" ); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "22ee532c7e521d8cbfabedb07673429d"); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "8b91c48f186c1e44723ac4e047ac553c" ); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "f586fcd8642deae6504d39430b20b713" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "c1b54d4221fb4fa88e0231a74310708e" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "95e26e8247d0c5e43705048c5ee64873"); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "ffbfd38b1720cfb67ba1bb63d4308552" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "427306f07f5fd905439b28a770f3d3d6" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -49,10 +49,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibrator1() { HashMap e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "a3d4153765625a451bee768ad46775b3" ); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "5954b7246d2e0c5c65267026819d5933"); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "91fd3c557db56fd969523d8c06d71a0b" ); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "9655f57949b01d6ee8a7c9b92e61a47f" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "ca839a4eb0ef443e0486b96843304f92" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "fd27c3ab424ef01c77d2dbdedf721a8a"); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b6e16538bda18336f176417cdf686f6a" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "be9a46e0d9b7ad90ce303d08dfb7b4de" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -80,7 +80,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesVCF() { HashMap e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "79e6738031f5a6f993838b33d70716c3"); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "5e132283b906f6de3328986fd0101be7"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -108,7 +108,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesNoReadGroups() { HashMap e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "d6f3ea0cf2129ca5488977f087f1aa25" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "199979b483f6c03c0977141f4fea9961" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -138,7 +138,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorNoReadGroups() { HashMap e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "c0d5fdd6e07395ae2d7b08431995b30d" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "2889c63854e233fadd9178ccf5b2517b" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -166,7 +166,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesNoIndex() { HashMap e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "f5a0adce5458ab661029fa5056a55551" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "906e5a08401722cc9a5528d2fd20ea6a" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -193,7 +193,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorNoIndex() { HashMap e = new HashMap(); - e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "33cb41fa3e959ac15ac9af329167a736" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "aa38b04c6b58badabb6b09d590284a2a" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey();