From 1e396af4d0ff764cad466e517033714ad9aa8ea5 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 31 Jul 2013 10:05:47 -0400 Subject: [PATCH] Two reduce reads updates/fixes: 1. Removing old legacy code that was capping the positional depth for reduced reads to 127. Unfortunately this cap affectively performs biased down-sampling and throws off e.g. FS numbers. Added end to end unit test that depth counts in RR can be higher than max byte. Some md5s change in the RR tests because depths are now (correctly) no longer capped at 127. 2. Down-sampling in ReduceReads was not safe as it could remove het compressed consensus reads. Refactored it so that it can only remove non-consensus reads. --- .../reducereads/SlidingWindow.java | 28 ++++---- .../ReduceReadsIntegrationTest.java | 50 ++++++++++++- .../reducereads/SlidingWindowUnitTest.java | 70 +++++++++++++++++++ 3 files changed, 131 insertions(+), 17 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index e15c68774..964c6b401 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -494,10 +494,10 @@ public class SlidingWindow { */ private void genericAddBaseToConsensus(final SyntheticRead syntheticRead, final BaseAndQualsCounts baseCounts) { final BaseIndex base = baseCounts.baseIndexWithMostProbability(); - byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE); - byte qual = baseCounts.averageQualsOfBase(base); - byte insQual = baseCounts.averageInsertionQualsOfBase(base); - byte delQual = baseCounts.averageDeletionQualsOfBase(base); + final int count = baseCounts.countOfBase(base); + final byte qual = baseCounts.averageQualsOfBase(base); + final byte insQual = baseCounts.averageInsertionQualsOfBase(base); + final byte delQual = baseCounts.averageDeletionQualsOfBase(base); syntheticRead.add(base, count, qual, insQual, delQual, baseCounts.getRMS()); } @@ -533,20 +533,24 @@ public class SlidingWindow { final int refStart = windowHeader.get(start).getLocation(); final int refStop = windowHeader.get(stop).getLocation(); - final ObjectList toRemove = new ObjectArrayList<>(); + final ObjectList toRemoveFromWindow = new ObjectArrayList<>(); + final ObjectList toEmit = new ObjectArrayList<>(); for ( final GATKSAMRecord read : readsInWindow ) { if ( read.getSoftStart() <= refStop ) { if ( read.getAlignmentEnd() >= refStart ) { - allReads.reads.add(read); + toEmit.add(read); removeFromHeader(windowHeader, read); } - toRemove.add(read); + toRemoveFromWindow.add(read); } } // remove all used reads - for ( final GATKSAMRecord read : toRemove ) + for ( final GATKSAMRecord read : toRemoveFromWindow ) readsInWindow.remove(read); + + // down-sample the unreduced reads if needed + allReads.reads.addAll(downsampleCoverage > 0 ? downsampleVariantRegion(toEmit) : toEmit); } return allReads; @@ -632,12 +636,8 @@ public class SlidingWindow { @Ensures("result != null") protected CloseVariantRegionResult closeVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) { final CloseVariantRegionResult allReads = compressVariantRegion(start, stop, knownSnpPositions); - - final CloseVariantRegionResult result = new CloseVariantRegionResult(allReads.stopPerformed); - result.reads.addAll(downsampleCoverage > 0 ? downsampleVariantRegion(allReads.reads) : allReads.reads); - result.reads.addAll(addAllSyntheticReadTypes(0, allReads.stopPerformed + 1)); - - return result; // finalized reads will be downsampled if necessary + allReads.reads.addAll(addAllSyntheticReadTypes(0, allReads.stopPerformed + 1)); + return allReads; } /** diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index 9942821e1..067f36d58 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -46,9 +46,14 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; +import org.testng.Assert; import org.testng.annotations.Test; import java.io.File; @@ -70,6 +75,8 @@ public class ReduceReadsIntegrationTest extends WalkerTest { final String COREDUCTION_BAM_B = validationDataLocation + "coreduction.test.B.bam"; final String COREDUCTION_L = " -L 1:1,853,860-1,854,354 -L 1:1,884,131-1,892,057"; final String OFFCONTIG_BAM = privateTestDir + "readOffb37contigMT.bam"; + final String HIGH_COVERAGE_BAM = privateTestDir + "NA20313.highCoverageRegion.bam"; + final String HIGH_COVERAGE_L = " -L 1:1650830-1650870"; final String BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM = privateTestDir + "bothEndsOfPairInVariantRegion.bam"; final String INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM = privateTestDir + "rr-too-many-insertions.bam"; @@ -221,13 +228,13 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testCoReduction() { String base = String.format("-T ReduceReads %s --cancer_mode -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; - executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("2fdc77ff5139f62db9697427b559f866"))); + executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("58c2bae5a339af2ea3c22a46ce8faa68"))); } @Test(enabled = true) public void testCoReductionWithKnowns() { String base = String.format("-T ReduceReads %s --cancer_mode -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s "; - executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("6db7fca364ba64f7db9510b412d731f0"))); + executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("5c251932b49d99a810581e3a6f762878"))); } @Test(enabled = true) @@ -258,7 +265,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { public void testDivideByZero() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s "; // we expect to lose coverage due to the downsampling so don't run the systematic tests - executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("82758efda419011642cb468809a50bf9"))); + executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("7dfe2647992ce1154db340fc742d523a"))); } /** @@ -299,5 +306,42 @@ public class ReduceReadsIntegrationTest extends WalkerTest { String cmd = "-T ReduceReads -npt -R " + b37KGReference + " -I " + privateTestDir + "rr_multisample.bam -o /dev/null"; executeTestWithoutAdditionalRRTests("testMultiSampleDoesNotFailWithFlag", new WalkerTestSpec(cmd, 0, UserException.BadInput.class)); } + + /** + * Confirm that compression is not capping coverage counts to max byte + */ + @Test(enabled = true) + public void testCompressionWorksForHighDepth() { + final String base = String.format("-T ReduceReads -npt -R %s -I %s %s", b37KGReference, HIGH_COVERAGE_BAM, HIGH_COVERAGE_L) + " -o %s"; + final File outputBam = executeTestWithoutAdditionalRRTests("testCompressionWorksForHighDepth", + new WalkerTestSpec(base, 1, Arrays.asList(""))).first.get(0); // No MD5s; we only want to check the coverage + + boolean sawHighCoveragePosition = false; + final SAMFileReader reader = new SAMFileReader(outputBam); + reader.setSAMRecordFactory(new GATKSamRecordFactory()); + + for ( final SAMRecord rawRead : reader ) { + final GATKSAMRecord read = (GATKSAMRecord)rawRead; + read.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, rawRead.getByteArrayAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG)); + + if ( ! read.isReducedRead() ) + continue; + + final int[] decodedCounts = read.getReducedReadCounts(); + for ( final int count : decodedCounts ) { + if ( count > Byte.MAX_VALUE ) { + sawHighCoveragePosition = true; + break; + } + } + + if ( sawHighCoveragePosition ) + break; + } + + reader.close(); + + Assert.assertTrue(sawHighCoveragePosition, "No positions were found with coverage over max byte (127); the coverage is incorrectly being capped somewhere!"); + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java index 3534284cd..720f00f98 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java @@ -619,6 +619,76 @@ public class SlidingWindowUnitTest extends BaseTest { Assert.assertEquals(result.size(), Math.min(dcov, basicReads.size())); } + @DataProvider(name = "DownsamplingFromClose") + public Object[][] createDownsamplingFromCloseTestData() { + + final ObjectList myReads = new ObjectArrayList<>(20); + for ( int i = 0; i < 21; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read" + i, 0, globalStartPosition, readLength); + final byte[] bases = Utils.dupBytes((byte) 'A', readLength); + if ( i < 5 ) + bases[50] = 'C'; + read.setReadBases(bases); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setMappingQuality(30); + read.setReadNegativeStrandFlag(false); + myReads.add(read); + } + + List tests = new ArrayList<>(); + + for ( int i = 1; i < 25; i++ ) + tests.add(new Object[]{myReads, i}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "DownsamplingFromClose", enabled = true) + public void testDownsamplingTestFromClose(final ObjectList myReads, final int dcov) { + + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false); + for ( final GATKSAMRecord read : myReads ) + slidingWindow.addRead(read); + Pair, CompressionStash> result = slidingWindow.close(new ObjectAVLTreeSet()); // no het compression + + Assert.assertEquals(result.getFirst().size(), Math.min(dcov, myReads.size()), "Down-sampling was not performed correctly"); + } + + @DataProvider(name = "NoDownsamplingForConsensusReads") + public Object[][] createNoDownsamplingForConsensusReadsData() { + + final ObjectList myReads = new ObjectArrayList<>(20); + for ( int i = 0; i < 30; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read" + i, 0, globalStartPosition, readLength); + final byte[] bases = Utils.dupBytes((byte) 'A', readLength); + if ( i < 10 ) + bases[50] = 'C'; + read.setReadBases(bases); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setMappingQuality(30); + read.setReadNegativeStrandFlag(false); + read.setReadNegativeStrandFlag(i % 2 == 0); + myReads.add(read); + } + + List tests = new ArrayList<>(); + + for ( int i = 0; i < 5; i++ ) + tests.add(new Object[]{myReads, i}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "NoDownsamplingForConsensusReads", enabled = true) + public void testNoDownsamplingForConsensusReads(final ObjectList myReads, final int dcov) { + + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false); + for ( final GATKSAMRecord read : myReads ) + slidingWindow.addRead(read); + Pair, CompressionStash> result = slidingWindow.close(null); // allow het compression (so we expect 4 reads) + + Assert.assertEquals(result.getFirst().size(), 4, "Down-sampling was performed on consensus reads!"); + } ////////////////////////////////////////////////////////////// //// This section tests the consensus base quals accuracy ////