Refactored the compression interface per sample in ReduceReads
The CompressionStash is now responsible for keeping track of all intervals that must be kept uncompressed by all samples. In general, this is a list generated by a tumor sample that all normal samples are then forced to abide by. - Updated ReduceReads integration tests - Sliding Window is now using the CompressionStash (single sample). DEV-104 #resolve #time 3m
This commit is contained in:
parent
b57df6cac8
commit
32ee2c7dff
|
|
@ -0,0 +1,21 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLocComparator;
|
||||
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* A stash of regions that must be kept uncompressed in all samples
|
||||
*
|
||||
* In general, these are regions that were kept uncompressed by a tumor sample and we want to force
|
||||
* all other samples (normals and/or tumors) to also keep these regions uncompressed
|
||||
*
|
||||
* User: carneiro
|
||||
* Date: 10/15/12
|
||||
* Time: 4:08 PM
|
||||
*/
|
||||
public class CompressionStash extends TreeSet<SimpleGenomeLoc> {
|
||||
public CompressionStash() {
|
||||
super(new GenomeLocComparator());
|
||||
}
|
||||
}
|
||||
|
|
@ -55,11 +55,12 @@ public class MultiSampleCompressor implements Compressor {
|
|||
final int minBaseQual,
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy,
|
||||
final int nContigs,
|
||||
final boolean allowPolyploidReduction) {
|
||||
final boolean allowPolyploidReduction,
|
||||
final CompressionStash compressionStash) {
|
||||
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
|
||||
compressorsPerSample.put(name,
|
||||
new SingleSampleCompressor(contextSize, downsampleCoverage,
|
||||
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction));
|
||||
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction, compressionStash));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -222,6 +222,8 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
HashMap<String, Long> readNameHash; // This hash will keep the name of the original read the new compressed name (a number).
|
||||
Long nextReadNumber = 1L; // The next number to use for the compressed read name.
|
||||
|
||||
CompressionStash compressionStash = new CompressionStash();
|
||||
|
||||
SortedSet<GenomeLoc> intervalList;
|
||||
|
||||
private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag
|
||||
|
|
@ -328,7 +330,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
*/
|
||||
@Override
|
||||
public ReduceReadsStash reduceInit() {
|
||||
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION));
|
||||
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION, compressionStash));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ public class SingleSampleCompressor implements Compressor {
|
|||
final private ReduceReads.DownsampleStrategy downsampleStrategy;
|
||||
final private int nContigs;
|
||||
final private boolean allowPolyploidReduction;
|
||||
final CompressionStash compressionStash;
|
||||
|
||||
private SlidingWindow slidingWindow;
|
||||
private int slidingWindowCounter;
|
||||
|
|
@ -33,7 +34,8 @@ public class SingleSampleCompressor implements Compressor {
|
|||
final int minBaseQual,
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy,
|
||||
final int nContigs,
|
||||
final boolean allowPolyploidReduction) {
|
||||
final boolean allowPolyploidReduction,
|
||||
final CompressionStash compressionStash) {
|
||||
this.contextSize = contextSize;
|
||||
this.downsampleCoverage = downsampleCoverage;
|
||||
this.minMappingQuality = minMappingQuality;
|
||||
|
|
@ -44,6 +46,7 @@ public class SingleSampleCompressor implements Compressor {
|
|||
this.downsampleStrategy = downsampleStrategy;
|
||||
this.nContigs = nContigs;
|
||||
this.allowPolyploidReduction = allowPolyploidReduction;
|
||||
this.compressionStash = compressionStash;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -65,7 +68,7 @@ public class SingleSampleCompressor implements Compressor {
|
|||
}
|
||||
|
||||
if ( slidingWindow == null) { // this is the first read
|
||||
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction);
|
||||
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction, compressionStash);
|
||||
slidingWindowCounter++;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,6 @@ import net.sf.samtools.CigarElement;
|
|||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.recalibration.EventType;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
|
|
@ -56,6 +55,7 @@ public class SlidingWindow {
|
|||
private final int nContigs;
|
||||
|
||||
private boolean allowPolyploidReductionInGeneral;
|
||||
private CompressionStash compressionStash;
|
||||
|
||||
/**
|
||||
* The types of synthetic reads to use in the finalizeAndAdd method
|
||||
|
|
@ -87,7 +87,7 @@ public class SlidingWindow {
|
|||
}
|
||||
|
||||
|
||||
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction) {
|
||||
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction, CompressionStash compressionStash) {
|
||||
this.contextSize = contextSize;
|
||||
this.downsampleCoverage = downsampleCoverage;
|
||||
|
||||
|
|
@ -118,6 +118,7 @@ public class SlidingWindow {
|
|||
this.nContigs = nContigs;
|
||||
|
||||
this.allowPolyploidReductionInGeneral = allowPolyploidReduction;
|
||||
this.compressionStash = compressionStash;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -145,7 +146,7 @@ public class SlidingWindow {
|
|||
* @param variantSite boolean array with true marking variant regions
|
||||
* @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region.
|
||||
*/
|
||||
private Pair<Integer, Integer> getNextVariantRegion(int from, int to, boolean[] variantSite) {
|
||||
private SimpleGenomeLoc getNextVariantRegion(int from, int to, boolean[] variantSite) {
|
||||
boolean foundStart = false;
|
||||
int variantRegionStartIndex = 0;
|
||||
for (int i=from; i<to; i++) {
|
||||
|
|
@ -154,10 +155,10 @@ public class SlidingWindow {
|
|||
foundStart = true;
|
||||
}
|
||||
else if(!variantSite[i] && foundStart) {
|
||||
return(new Pair<Integer, Integer>(variantRegionStartIndex, i-1));
|
||||
return(new SimpleGenomeLoc(contig, contigIndex, variantRegionStartIndex, i-1, true));
|
||||
}
|
||||
}
|
||||
return (foundStart) ? new Pair<Integer, Integer>(variantRegionStartIndex, -1) : null;
|
||||
return (foundStart) ? new SimpleGenomeLoc(contig, contigIndex, variantRegionStartIndex, to-1, false) : null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -168,23 +169,22 @@ public class SlidingWindow {
|
|||
* @param variantSite boolean array with true marking variant regions
|
||||
* @return a list with start/stops of variant regions following getNextVariantRegion description
|
||||
*/
|
||||
private List<Pair<Integer, Integer>> getAllVariantRegions(int from, int to, boolean[] variantSite) {
|
||||
List<Pair<Integer,Integer>> regions = new LinkedList<Pair<Integer, Integer>>();
|
||||
private CompressionStash getVariantRegionsFromThisSample(int from, int to, boolean[] variantSite) {
|
||||
CompressionStash regions = new CompressionStash();
|
||||
int index = from;
|
||||
while(index < to) {
|
||||
Pair<Integer,Integer> result = getNextVariantRegion(index, to, variantSite);
|
||||
SimpleGenomeLoc result = getNextVariantRegion(index, to, variantSite);
|
||||
if (result == null)
|
||||
break;
|
||||
|
||||
regions.add(result);
|
||||
if (result.getSecond() < 0)
|
||||
if (result.getStop() < 0)
|
||||
break;
|
||||
index = result.getSecond() + 1;
|
||||
index = result.getStop() + 1;
|
||||
}
|
||||
return regions;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Determines if the window can be slid given the new incoming read.
|
||||
*
|
||||
|
|
@ -203,7 +203,7 @@ public class SlidingWindow {
|
|||
boolean[] variantSite = markSites(getStartLocation(windowHeader) + readStartHeaderIndex);
|
||||
int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive)
|
||||
|
||||
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, breakpoint, variantSite);
|
||||
CompressionStash regions = getVariantRegionsFromThisSample(0, breakpoint, variantSite);
|
||||
finalizedReads = closeVariantRegions(regions, false);
|
||||
|
||||
List<GATKSAMRecord> readsToRemove = new LinkedList<GATKSAMRecord>();
|
||||
|
|
@ -567,26 +567,31 @@ public class SlidingWindow {
|
|||
result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
|
||||
result.addAll(finalizeAndAdd(ConsensusType.BOTH));
|
||||
|
||||
return result; // finalized reads will be downsampled if necessary
|
||||
return result; // finalized reads will be downsampled if necessary
|
||||
}
|
||||
|
||||
|
||||
private List<GATKSAMRecord> closeVariantRegions(List<Pair<Integer, Integer>> regions, boolean forceClose) {
|
||||
private List<GATKSAMRecord> closeVariantRegions(CompressionStash regions, boolean forceClose) {
|
||||
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
|
||||
if (!regions.isEmpty()) {
|
||||
int lastStop = -1;
|
||||
for (Pair<Integer, Integer> region : regions) {
|
||||
int start = region.getFirst();
|
||||
int stop = region.getSecond();
|
||||
if (stop < 0 && forceClose)
|
||||
stop = windowHeader.size() - 1;
|
||||
if (stop >= 0) {
|
||||
allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1));
|
||||
lastStop = stop;
|
||||
for (SimpleGenomeLoc region : regions) {
|
||||
int start = region.getStart();
|
||||
int stop = region.getStop();
|
||||
|
||||
if (!region.isFinished()) {
|
||||
if(forceClose) // region is unfinished but we're forcing the close of this window
|
||||
stop = windowHeader.size() - 1;
|
||||
else
|
||||
continue; // region is unfinished and we're not forcing the close of this window
|
||||
}
|
||||
|
||||
allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1));
|
||||
lastStop = stop;
|
||||
}
|
||||
for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion)
|
||||
windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here!
|
||||
|
||||
for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion)
|
||||
windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here!
|
||||
}
|
||||
return allReads;
|
||||
}
|
||||
|
|
@ -626,7 +631,7 @@ public class SlidingWindow {
|
|||
|
||||
if (!windowHeader.isEmpty()) {
|
||||
boolean[] variantSite = markSites(getStopLocation(windowHeader) + 1);
|
||||
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, windowHeader.size(), variantSite);
|
||||
CompressionStash regions = getVariantRegionsFromThisSample(0, windowHeader.size(), variantSite);
|
||||
finalizedReads = closeVariantRegions(regions, true);
|
||||
|
||||
if (!windowHeader.isEmpty()) {
|
||||
|
|
|
|||
|
|
@ -21,36 +21,36 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testDefaultCompression() {
|
||||
RRTest("testDefaultCompression ", L, "323dd4deabd7767efa0f2c6e7fa4189f");
|
||||
RRTest("testDefaultCompression ", L, "1f95f3193bd9f120a73c34a0087abaf6");
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testMultipleIntervals() {
|
||||
String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
|
||||
RRTest("testMultipleIntervals ", intervals, "c437fb160547ff271f8eba30e5f3ff76");
|
||||
RRTest("testMultipleIntervals ", intervals, "79213d6ac68d56d4d72dcf511223e424");
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testHighCompression() {
|
||||
RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "3a607bc3ebaf84e9dc44e005c5f8a047");
|
||||
RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "dab2aa8e3655139974bbe12a568363d9");
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testLowCompression() {
|
||||
RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "7c9b4a70c2c90b0a995800aa42852e63");
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testIndelCompression() {
|
||||
RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f7b9fa44c10bc4b2247813d2b8dc1973");
|
||||
RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "1255245ed4ebeacda90f0dbb4e4da081");
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testFilteredDeletionCompression() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s ";
|
||||
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("891bd6dcda66611f343e8ff25f34aaeb")));
|
||||
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("122e4e60c4412a31d0aeb3cce879e841")));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -61,20 +61,20 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
*
|
||||
* This bam is simplified to replicate the exact bug with the three provided intervals.
|
||||
*/
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testAddingReadAfterTailingTheStash() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s ";
|
||||
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("886b43e1f26ff18425814dc7563931c6")));
|
||||
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("4b590269cbe3574dbdd5bdc2bc6f5f1c")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Divide by zero bug reported by GdA and users in the forum. Happens when the downsampler goes over a region where all reads get
|
||||
* filtered out.
|
||||
*/
|
||||
@Test(enabled = false)
|
||||
@Test(enabled = true)
|
||||
public void testDivideByZero() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s ";
|
||||
executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("93ffdc209d4cc0fc4f0169ca9be55cc2")));
|
||||
executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("d8d066304f7c187f182bfb50f39baa0c")));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,30 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
/**
|
||||
* GenomeLocs are very useful objects to keep track of genomic locations and perform set operations
|
||||
* with them.
|
||||
*
|
||||
* However, GenomeLocs are bound to strict validation through the GenomeLocParser and cannot
|
||||
* be created easily for small tasks that do not require the rigors of the GenomeLocParser validation
|
||||
*
|
||||
* SimpleGenomeLoc is a simple utility to create GenomeLocs without going through the parser. Should
|
||||
* only be used outside of the engine.
|
||||
*
|
||||
* User: carneiro
|
||||
* Date: 10/16/12
|
||||
* Time: 2:07 PM
|
||||
*/
|
||||
public class SimpleGenomeLoc extends GenomeLoc {
|
||||
private boolean finished;
|
||||
|
||||
public SimpleGenomeLoc(String contigName, int contigIndex, int start, int stop, boolean finished) {
|
||||
super(contigName, contigIndex, start, stop);
|
||||
this.finished = finished;
|
||||
}
|
||||
|
||||
public boolean isFinished() {
|
||||
return finished;
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue