From 62dd2fa5be28c959780a1488612a628cb68f5b8a Mon Sep 17 00:00:00 2001 From: rpoplin Date: Mon, 11 Jan 2010 14:28:52 +0000 Subject: [PATCH] Fixing another bug in solid recal regarding negative strand reads. The isInconsistentColorSpace method incorrectly used the inconsistent tag added by parseColorSpace, the inconsistent tag is in the direction of the read like the color space tag, and not in the direction of the reference like everything else. This affects the recalibrated quality scores but the improvment in SNP calling performance is minor when using the default UG settings (min base quality 10). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2553 348d0f76-0448-11de-a6fe-93d51630548a --- .../recalibration/CovariateCounterWalker.java | 6 +-- .../recalibration/RecalDataManager.java | 45 +++++++++++++++---- .../RecalibrationArgumentCollection.java | 2 +- .../TableRecalibrationWalker.java | 18 ++++---- .../RecalibrationWalkersIntegrationTest.java | 16 +++---- 5 files changed, 57 insertions(+), 30 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index 9eebf9223..b814db015 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -103,7 +103,7 @@ public class CovariateCounterWalker extends LocusWalker { private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted. private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site - private static final String versionString = "v2.2.3"; // Major version, minor version, and build number + private static final String versionString = "v2.2.4"; // Major version, minor version, and build number private Pair dbSNP_counts = new Pair(0L, 0L); // mismatch/base counts for dbSNP loci private Pair novel_counts = new Pair(0L, 0L); // mismatch/base counts for non-dbSNP loci private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) @@ -170,7 +170,7 @@ public class CovariateCounterWalker extends LocusWalker { if( USE_STANDARD_COVARIATES ) { for( Class covClass : standardClasses ) { try { - Covariate covariate = (Covariate)covClass.newInstance(); + final Covariate covariate = (Covariate)covClass.newInstance(); requestedCovariates.add( covariate ); } catch ( InstantiationException e ) { throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); @@ -189,7 +189,7 @@ public class CovariateCounterWalker extends LocusWalker { if( !requiredClasses.contains( covClass ) && (!USE_STANDARD_COVARIATES || !standardClasses.contains( covClass )) ) { try { // Now that we've found a matching class, try to instantiate it - Covariate covariate = (Covariate)covClass.newInstance(); + final Covariate covariate = (Covariate)covClass.newInstance(); requestedCovariates.add( covariate ); } catch ( InstantiationException e ) { throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index 07cd9a175..c8ea00bd1 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -45,10 +45,10 @@ import net.sf.samtools.SAMReadGroupRecord; public class RecalDataManager { - public NestedHashMap data; // The full dataset - private NestedHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed - private NestedHashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed - private ArrayList dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed + public final NestedHashMap data; // The full dataset + private final NestedHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed + private final NestedHashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed + private final ArrayList dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams @@ -59,10 +59,14 @@ public class RecalDataManager { RecalDataManager() { data = new NestedHashMap(); + dataCollapsedReadGroup = null; + dataCollapsedQualityScore = null; + dataCollapsedByCovariate = null; } RecalDataManager( final boolean createCollapsedTables, final int numCovariates ) { if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker + data = null; dataCollapsedReadGroup = new NestedHashMap(); dataCollapsedQualityScore = new NestedHashMap(); dataCollapsedByCovariate = new ArrayList(); @@ -71,6 +75,9 @@ public class RecalDataManager { } } else { data = new NestedHashMap(); + dataCollapsedReadGroup = null; + dataCollapsedQualityScore = null; + dataCollapsedByCovariate = null; } } @@ -321,10 +328,10 @@ public class RecalDataManager { // Now that we have the inconsistency array apply the desired correction to the inconsistent bases if( SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") ) { // Set inconsistent bases and the one before it to Q0 - boolean setBaseN = false; + final boolean setBaseN = false; originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, isMappedToReference, setBaseN); } else if( SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO_BASE_N") ) { - boolean setBaseN = true; + final boolean setBaseN = true; originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, isMappedToReference, setBaseN); } else if( SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") ) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead, isMappedToReference, coinFlip); @@ -469,13 +476,33 @@ public class RecalDataManager { * Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality * @param read The read which contains the color space to check against * @param offset The offset in the read at which to check - * @return Returns true if the base is inconsistent with the color space + * @return Returns true if the base was inconsistent with the color space */ public static boolean isInconsistentColorSpace( final SAMRecord read, final int offset ) { final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG); if( attr != null ) { - final byte[] colorSpace = (byte[])attr; - return colorSpace[offset] != 0; + final byte[] inconsistency = (byte[])attr; + // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference! + if( read.getReadNegativeStrandFlag() ) { // Negative direction + return inconsistency[inconsistency.length - offset - 1] != 0; + } else { // Forward direction + return inconsistency[offset] != 0; + } + + // This block of code is if you want to check both the offset and the next base for color space inconsistency + //if( read.getReadNegativeStrandFlag() ) { // Negative direction + // if( offset == 0 ) { + // return inconsistency[0] != 0; + // } else { + // return (inconsistency[inconsistency.length - offset - 1] != 0) || (inconsistency[inconsistency.length - offset] != 0); + // } + //} else { // Forward direction + // if( offset == inconsistency.length - 1 ) { + // return inconsistency[inconsistency.length - 1] != 0; + // } else { + // return (inconsistency[offset] != 0) || (inconsistency[offset + 1] != 0); + // } + //} } else { return false; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java index 6e307b705..ecc07afec 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java @@ -63,7 +63,7 @@ public class RecalibrationArgumentCollection { public boolean EXCEPTION_IF_NO_TILE = false; - public boolean checkSolidRecalMode() { + public final boolean checkSolidRecalMode() { return ( SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") || SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO_BASE_N") || SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") ); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 16bcdd2a3..40adfeab4 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -94,7 +94,7 @@ public class TableRecalibrationWalker extends ReadWalker> classes = PackageUtils.getClassesImplementingInterface(Covariate.class); + final List> classes = PackageUtils.getClassesImplementingInterface(Covariate.class); int lineNumber = 0; boolean foundAllCovariates = false; @@ -223,7 +223,7 @@ public class TableRecalibrationWalker extends ReadWalker adList = atd.createArgumentDefinitions(new ArgumentSource(field.getType(), field)); + final ArgumentTypeDescriptor atd = ArgumentTypeDescriptor.create(field.getType()); + final List adList = atd.createArgumentDefinitions(new ArgumentSource(field.getType(), field)); for( ArgumentDefinition ad : adList ) { commandLineString += (ad.fullName + "=" + JVMUtils.getFieldValue(field, RAC) + ", "); } @@ -355,8 +355,8 @@ public class TableRecalibrationWalker extends ReadWalker e = new HashMap(); e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "e5b2d5a2f4283718dae678cbc84be847" ); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "ff1b3a18b67b09560cacc3b5dea0a034"); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "89084b43b824f9e3c5e2afdfe0930542"); e.put( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "7d6428a76e07ed4b99351aa4df89634d" ); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "a93b0263acdc856b885f95848852140d" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "6ede6fc840c4e5070a58a919b48e7504" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -50,9 +50,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { public void testTableRecalibrator1() { HashMap e = new HashMap(); e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "6c59d291c37d053e0f188b762f3060a5" ); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "0f4a882d78747380af2822317c7b6045"); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "e06f1397b9c40f75e96cd3df76730ee0"); e.put( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "7ebdce416b72679e1cf88cc9886a5edc" ); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "41b0313bfcaf5afb3cb85f9830e79769" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "48ddc93cae054f9423f3a7ed9f36540e" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -80,7 +80,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorMaxQ70() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "8120b648a43b293b84a7cb6374513fb8" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "665711dfb81d67582b28faea24e26160" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -109,7 +109,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesVCF() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "d90342547ed228cf446caf594586f4b0"); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "9b9d21ffb70f15ef2aebad21f3fc05cb"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -136,7 +136,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesNoReadGroups() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "7e3045dcb2da1f4b305db7fa72bd1b51" ); + e.put( validationDataLocation + "NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "a86c64f649b847b7f81ac50a808d3d45" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -165,7 +165,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorNoReadGroups() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "d9d29f38f8ea31ac267f9d937cad9291" ); + e.put( validationDataLocation + "NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "32ad300e8c094ed2c1ec6c531180fe70" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey();