Two reduce reads updates/fixes:
1. Removing old legacy code that was capping the positional depth for reduced reads to 127. Unfortunately this cap affectively performs biased down-sampling and throws off e.g. FS numbers. Added end to end unit test that depth counts in RR can be higher than max byte. Some md5s change in the RR tests because depths are now (correctly) no longer capped at 127. 2. Down-sampling in ReduceReads was not safe as it could remove het compressed consensus reads. Refactored it so that it can only remove non-consensus reads.
This commit is contained in:
parent
ec3c885a25
commit
1e396af4d0
|
|
@ -494,10 +494,10 @@ public class SlidingWindow {
|
|||
*/
|
||||
private void genericAddBaseToConsensus(final SyntheticRead syntheticRead, final BaseAndQualsCounts baseCounts) {
|
||||
final BaseIndex base = baseCounts.baseIndexWithMostProbability();
|
||||
byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE);
|
||||
byte qual = baseCounts.averageQualsOfBase(base);
|
||||
byte insQual = baseCounts.averageInsertionQualsOfBase(base);
|
||||
byte delQual = baseCounts.averageDeletionQualsOfBase(base);
|
||||
final int count = baseCounts.countOfBase(base);
|
||||
final byte qual = baseCounts.averageQualsOfBase(base);
|
||||
final byte insQual = baseCounts.averageInsertionQualsOfBase(base);
|
||||
final byte delQual = baseCounts.averageDeletionQualsOfBase(base);
|
||||
syntheticRead.add(base, count, qual, insQual, delQual, baseCounts.getRMS());
|
||||
}
|
||||
|
||||
|
|
@ -533,20 +533,24 @@ public class SlidingWindow {
|
|||
final int refStart = windowHeader.get(start).getLocation();
|
||||
final int refStop = windowHeader.get(stop).getLocation();
|
||||
|
||||
final ObjectList<GATKSAMRecord> toRemove = new ObjectArrayList<>();
|
||||
final ObjectList<GATKSAMRecord> toRemoveFromWindow = new ObjectArrayList<>();
|
||||
final ObjectList<GATKSAMRecord> toEmit = new ObjectArrayList<>();
|
||||
for ( final GATKSAMRecord read : readsInWindow ) {
|
||||
if ( read.getSoftStart() <= refStop ) {
|
||||
if ( read.getAlignmentEnd() >= refStart ) {
|
||||
allReads.reads.add(read);
|
||||
toEmit.add(read);
|
||||
removeFromHeader(windowHeader, read);
|
||||
}
|
||||
toRemove.add(read);
|
||||
toRemoveFromWindow.add(read);
|
||||
}
|
||||
}
|
||||
|
||||
// remove all used reads
|
||||
for ( final GATKSAMRecord read : toRemove )
|
||||
for ( final GATKSAMRecord read : toRemoveFromWindow )
|
||||
readsInWindow.remove(read);
|
||||
|
||||
// down-sample the unreduced reads if needed
|
||||
allReads.reads.addAll(downsampleCoverage > 0 ? downsampleVariantRegion(toEmit) : toEmit);
|
||||
}
|
||||
|
||||
return allReads;
|
||||
|
|
@ -632,12 +636,8 @@ public class SlidingWindow {
|
|||
@Ensures("result != null")
|
||||
protected CloseVariantRegionResult closeVariantRegion(final int start, final int stop, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
final CloseVariantRegionResult allReads = compressVariantRegion(start, stop, knownSnpPositions);
|
||||
|
||||
final CloseVariantRegionResult result = new CloseVariantRegionResult(allReads.stopPerformed);
|
||||
result.reads.addAll(downsampleCoverage > 0 ? downsampleVariantRegion(allReads.reads) : allReads.reads);
|
||||
result.reads.addAll(addAllSyntheticReadTypes(0, allReads.stopPerformed + 1));
|
||||
|
||||
return result; // finalized reads will be downsampled if necessary
|
||||
allReads.reads.addAll(addAllSyntheticReadTypes(0, allReads.stopPerformed + 1));
|
||||
return allReads;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -46,9 +46,14 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -70,6 +75,8 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
final String COREDUCTION_BAM_B = validationDataLocation + "coreduction.test.B.bam";
|
||||
final String COREDUCTION_L = " -L 1:1,853,860-1,854,354 -L 1:1,884,131-1,892,057";
|
||||
final String OFFCONTIG_BAM = privateTestDir + "readOffb37contigMT.bam";
|
||||
final String HIGH_COVERAGE_BAM = privateTestDir + "NA20313.highCoverageRegion.bam";
|
||||
final String HIGH_COVERAGE_L = " -L 1:1650830-1650870";
|
||||
final String BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM = privateTestDir + "bothEndsOfPairInVariantRegion.bam";
|
||||
final String INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM = privateTestDir + "rr-too-many-insertions.bam";
|
||||
|
||||
|
|
@ -221,13 +228,13 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
@Test(enabled = true)
|
||||
public void testCoReduction() {
|
||||
String base = String.format("-T ReduceReads %s --cancer_mode -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s ";
|
||||
executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("2fdc77ff5139f62db9697427b559f866")));
|
||||
executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("58c2bae5a339af2ea3c22a46ce8faa68")));
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testCoReductionWithKnowns() {
|
||||
String base = String.format("-T ReduceReads %s --cancer_mode -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s ";
|
||||
executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("6db7fca364ba64f7db9510b412d731f0")));
|
||||
executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("5c251932b49d99a810581e3a6f762878")));
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
|
|
@ -258,7 +265,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
public void testDivideByZero() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s ";
|
||||
// we expect to lose coverage due to the downsampling so don't run the systematic tests
|
||||
executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("82758efda419011642cb468809a50bf9")));
|
||||
executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("7dfe2647992ce1154db340fc742d523a")));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -299,5 +306,42 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
String cmd = "-T ReduceReads -npt -R " + b37KGReference + " -I " + privateTestDir + "rr_multisample.bam -o /dev/null";
|
||||
executeTestWithoutAdditionalRRTests("testMultiSampleDoesNotFailWithFlag", new WalkerTestSpec(cmd, 0, UserException.BadInput.class));
|
||||
}
|
||||
|
||||
/**
|
||||
* Confirm that compression is not capping coverage counts to max byte
|
||||
*/
|
||||
@Test(enabled = true)
|
||||
public void testCompressionWorksForHighDepth() {
|
||||
final String base = String.format("-T ReduceReads -npt -R %s -I %s %s", b37KGReference, HIGH_COVERAGE_BAM, HIGH_COVERAGE_L) + " -o %s";
|
||||
final File outputBam = executeTestWithoutAdditionalRRTests("testCompressionWorksForHighDepth",
|
||||
new WalkerTestSpec(base, 1, Arrays.asList(""))).first.get(0); // No MD5s; we only want to check the coverage
|
||||
|
||||
boolean sawHighCoveragePosition = false;
|
||||
final SAMFileReader reader = new SAMFileReader(outputBam);
|
||||
reader.setSAMRecordFactory(new GATKSamRecordFactory());
|
||||
|
||||
for ( final SAMRecord rawRead : reader ) {
|
||||
final GATKSAMRecord read = (GATKSAMRecord)rawRead;
|
||||
read.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, rawRead.getByteArrayAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG));
|
||||
|
||||
if ( ! read.isReducedRead() )
|
||||
continue;
|
||||
|
||||
final int[] decodedCounts = read.getReducedReadCounts();
|
||||
for ( final int count : decodedCounts ) {
|
||||
if ( count > Byte.MAX_VALUE ) {
|
||||
sawHighCoveragePosition = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ( sawHighCoveragePosition )
|
||||
break;
|
||||
}
|
||||
|
||||
reader.close();
|
||||
|
||||
Assert.assertTrue(sawHighCoveragePosition, "No positions were found with coverage over max byte (127); the coverage is incorrectly being capped somewhere!");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -619,6 +619,76 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
Assert.assertEquals(result.size(), Math.min(dcov, basicReads.size()));
|
||||
}
|
||||
|
||||
@DataProvider(name = "DownsamplingFromClose")
|
||||
public Object[][] createDownsamplingFromCloseTestData() {
|
||||
|
||||
final ObjectList<GATKSAMRecord> myReads = new ObjectArrayList<>(20);
|
||||
for ( int i = 0; i < 21; i++ ) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read" + i, 0, globalStartPosition, readLength);
|
||||
final byte[] bases = Utils.dupBytes((byte) 'A', readLength);
|
||||
if ( i < 5 )
|
||||
bases[50] = 'C';
|
||||
read.setReadBases(bases);
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, readLength));
|
||||
read.setMappingQuality(30);
|
||||
read.setReadNegativeStrandFlag(false);
|
||||
myReads.add(read);
|
||||
}
|
||||
|
||||
List<Object[]> tests = new ArrayList<>();
|
||||
|
||||
for ( int i = 1; i < 25; i++ )
|
||||
tests.add(new Object[]{myReads, i});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "DownsamplingFromClose", enabled = true)
|
||||
public void testDownsamplingTestFromClose(final ObjectList<GATKSAMRecord> myReads, final int dcov) {
|
||||
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : myReads )
|
||||
slidingWindow.addRead(read);
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> result = slidingWindow.close(new ObjectAVLTreeSet<GenomeLoc>()); // no het compression
|
||||
|
||||
Assert.assertEquals(result.getFirst().size(), Math.min(dcov, myReads.size()), "Down-sampling was not performed correctly");
|
||||
}
|
||||
|
||||
@DataProvider(name = "NoDownsamplingForConsensusReads")
|
||||
public Object[][] createNoDownsamplingForConsensusReadsData() {
|
||||
|
||||
final ObjectList<GATKSAMRecord> myReads = new ObjectArrayList<>(20);
|
||||
for ( int i = 0; i < 30; i++ ) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read" + i, 0, globalStartPosition, readLength);
|
||||
final byte[] bases = Utils.dupBytes((byte) 'A', readLength);
|
||||
if ( i < 10 )
|
||||
bases[50] = 'C';
|
||||
read.setReadBases(bases);
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, readLength));
|
||||
read.setMappingQuality(30);
|
||||
read.setReadNegativeStrandFlag(false);
|
||||
read.setReadNegativeStrandFlag(i % 2 == 0);
|
||||
myReads.add(read);
|
||||
}
|
||||
|
||||
List<Object[]> tests = new ArrayList<>();
|
||||
|
||||
for ( int i = 0; i < 5; i++ )
|
||||
tests.add(new Object[]{myReads, i});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "NoDownsamplingForConsensusReads", enabled = true)
|
||||
public void testNoDownsamplingForConsensusReads(final ObjectList<GATKSAMRecord> myReads, final int dcov) {
|
||||
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : myReads )
|
||||
slidingWindow.addRead(read);
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> result = slidingWindow.close(null); // allow het compression (so we expect 4 reads)
|
||||
|
||||
Assert.assertEquals(result.getFirst().size(), 4, "Down-sampling was performed on consensus reads!");
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////
|
||||
//// This section tests the consensus base quals accuracy ////
|
||||
|
|
|
|||
Loading…
Reference in New Issue