Co-Reduction implementation in ReduceReads
ReduceReads now co-reduces bams if they're passed in toghether with multiple -I. Co-reduction forces every variant region in one sample to be a variant region in all samples. Also: * Added integrationtest for co-reduction * Fixed bug with new no-recalculation implementation of the marksites object where the last object wasn't being removed after finalizing a variant region (updated MD5's accordingly) DEV-200 #resolve #time 8m
This commit is contained in:
parent
2ec3852acd
commit
2c0bf89961
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLocComparator;
|
import org.broadinstitute.sting.utils.GenomeLocComparator;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -18,4 +19,41 @@ public class CompressionStash extends TreeSet<SimpleGenomeLoc> {
|
||||||
public CompressionStash() {
|
public CompressionStash() {
|
||||||
super(new GenomeLocComparator());
|
super(new GenomeLocComparator());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds a SimpleGenomeLoc to the stash and merges it with any overlapping (and contiguous) existing loc
|
||||||
|
* in the stash.
|
||||||
|
*
|
||||||
|
* @param insertLoc the new loc to be inserted
|
||||||
|
* @return true if the loc, or it's merged version, wasn't present in the list before.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public boolean add(SimpleGenomeLoc insertLoc) {
|
||||||
|
TreeSet<SimpleGenomeLoc> removedLocs = new TreeSet<SimpleGenomeLoc>();
|
||||||
|
for (SimpleGenomeLoc existingLoc : this) {
|
||||||
|
if (existingLoc.isPast(insertLoc)) {
|
||||||
|
break; // if we're past the loc we're done looking for overlaps.
|
||||||
|
}
|
||||||
|
if (existingLoc.equals(insertLoc)) {
|
||||||
|
return false; // if this loc was already present in the stash, we don't need to insert it.
|
||||||
|
}
|
||||||
|
if (existingLoc.contiguousP(insertLoc)) {
|
||||||
|
removedLocs.add(existingLoc); // list the original loc for merging
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (SimpleGenomeLoc loc : removedLocs) {
|
||||||
|
this.remove(loc); // remove all locs that will be merged
|
||||||
|
}
|
||||||
|
removedLocs.add(insertLoc); // add the new loc to the list of locs that will be merged
|
||||||
|
return super.add(SimpleGenomeLoc.merge(removedLocs)); // merge them all into one loc and add to the stash
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean addAll(Collection<? extends SimpleGenomeLoc> locs) {
|
||||||
|
boolean result = false;
|
||||||
|
for (SimpleGenomeLoc loc : locs) {
|
||||||
|
result |= this.add(loc);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -3,13 +3,14 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||||
import net.sf.samtools.SAMFileHeader;
|
import net.sf.samtools.SAMFileHeader;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.utils.SampleUtils;
|
import org.broadinstitute.sting.utils.SampleUtils;
|
||||||
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.SortedSet;
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
@ -41,7 +42,7 @@ import java.util.TreeSet;
|
||||||
*
|
*
|
||||||
* @author depristo
|
* @author depristo
|
||||||
*/
|
*/
|
||||||
public class MultiSampleCompressor implements Compressor {
|
public class MultiSampleCompressor {
|
||||||
protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class);
|
protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class);
|
||||||
|
|
||||||
protected Map<String, SingleSampleCompressor> compressorsPerSample = new HashMap<String, SingleSampleCompressor>();
|
protected Map<String, SingleSampleCompressor> compressorsPerSample = new HashMap<String, SingleSampleCompressor>();
|
||||||
|
|
@ -55,30 +56,44 @@ public class MultiSampleCompressor implements Compressor {
|
||||||
final int minBaseQual,
|
final int minBaseQual,
|
||||||
final ReduceReads.DownsampleStrategy downsampleStrategy,
|
final ReduceReads.DownsampleStrategy downsampleStrategy,
|
||||||
final int nContigs,
|
final int nContigs,
|
||||||
final boolean allowPolyploidReduction,
|
final boolean allowPolyploidReduction) {
|
||||||
final CompressionStash compressionStash) {
|
|
||||||
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
|
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
|
||||||
compressorsPerSample.put(name,
|
compressorsPerSample.put(name,
|
||||||
new SingleSampleCompressor(contextSize, downsampleCoverage,
|
new SingleSampleCompressor(contextSize, downsampleCoverage,
|
||||||
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction, compressionStash));
|
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
public Set<GATKSAMRecord> addAlignment(GATKSAMRecord read) {
|
||||||
public Iterable<GATKSAMRecord> addAlignment(GATKSAMRecord read) {
|
String sampleName = read.getReadGroup().getSample();
|
||||||
String sample = read.getReadGroup().getSample();
|
SingleSampleCompressor compressor = compressorsPerSample.get(sampleName);
|
||||||
SingleSampleCompressor compressor = compressorsPerSample.get(sample);
|
|
||||||
if ( compressor == null )
|
if ( compressor == null )
|
||||||
throw new ReviewedStingException("No compressor for sample " + sample);
|
throw new ReviewedStingException("No compressor for sample " + sampleName);
|
||||||
return compressor.addAlignment(read);
|
Pair<Set<GATKSAMRecord>, CompressionStash> readsAndStash = compressor.addAlignment(read);
|
||||||
|
Set<GATKSAMRecord> reads = readsAndStash.getFirst();
|
||||||
|
CompressionStash regions = readsAndStash.getSecond();
|
||||||
|
|
||||||
|
reads.addAll(closeVariantRegionsInAllSamples(regions));
|
||||||
|
|
||||||
|
return reads;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
public Set<GATKSAMRecord> close() {
|
||||||
public Iterable<GATKSAMRecord> close() {
|
Set<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||||
SortedSet<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
for ( SingleSampleCompressor sample : compressorsPerSample.values() ) {
|
||||||
for ( SingleSampleCompressor comp : compressorsPerSample.values() )
|
Pair<Set<GATKSAMRecord>, CompressionStash> readsAndStash = sample.close();
|
||||||
for ( GATKSAMRecord read : comp.close() )
|
reads = readsAndStash.getFirst();
|
||||||
reads.add(read);
|
}
|
||||||
|
return reads;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Set<GATKSAMRecord> closeVariantRegionsInAllSamples(CompressionStash regions) {
|
||||||
|
Set<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||||
|
if (!regions.isEmpty()) {
|
||||||
|
for (SingleSampleCompressor sample : compressorsPerSample.values()) {
|
||||||
|
reads.addAll(sample.closeVariantRegions(regions));
|
||||||
|
}
|
||||||
|
}
|
||||||
return reads;
|
return reads;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -330,7 +330,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public ReduceReadsStash reduceInit() {
|
public ReduceReadsStash reduceInit() {
|
||||||
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION, compressionStash));
|
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,10 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
|
|
||||||
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -10,7 +12,7 @@ import java.util.TreeSet;
|
||||||
* @author carneiro, depristo
|
* @author carneiro, depristo
|
||||||
* @version 3.0
|
* @version 3.0
|
||||||
*/
|
*/
|
||||||
public class SingleSampleCompressor implements Compressor {
|
public class SingleSampleCompressor {
|
||||||
final private int contextSize;
|
final private int contextSize;
|
||||||
final private int downsampleCoverage;
|
final private int downsampleCoverage;
|
||||||
final private int minMappingQuality;
|
final private int minMappingQuality;
|
||||||
|
|
@ -20,11 +22,11 @@ public class SingleSampleCompressor implements Compressor {
|
||||||
final private ReduceReads.DownsampleStrategy downsampleStrategy;
|
final private ReduceReads.DownsampleStrategy downsampleStrategy;
|
||||||
final private int nContigs;
|
final private int nContigs;
|
||||||
final private boolean allowPolyploidReduction;
|
final private boolean allowPolyploidReduction;
|
||||||
final CompressionStash compressionStash;
|
|
||||||
|
|
||||||
private SlidingWindow slidingWindow;
|
private SlidingWindow slidingWindow;
|
||||||
private int slidingWindowCounter;
|
private int slidingWindowCounter;
|
||||||
|
|
||||||
|
public static Pair<Set<GATKSAMRecord>, CompressionStash> emptyPair = new Pair<Set<GATKSAMRecord>,CompressionStash>(new TreeSet<GATKSAMRecord>(), new CompressionStash());
|
||||||
|
|
||||||
public SingleSampleCompressor(final int contextSize,
|
public SingleSampleCompressor(final int contextSize,
|
||||||
final int downsampleCoverage,
|
final int downsampleCoverage,
|
||||||
|
|
@ -34,8 +36,7 @@ public class SingleSampleCompressor implements Compressor {
|
||||||
final int minBaseQual,
|
final int minBaseQual,
|
||||||
final ReduceReads.DownsampleStrategy downsampleStrategy,
|
final ReduceReads.DownsampleStrategy downsampleStrategy,
|
||||||
final int nContigs,
|
final int nContigs,
|
||||||
final boolean allowPolyploidReduction,
|
final boolean allowPolyploidReduction) {
|
||||||
final CompressionStash compressionStash) {
|
|
||||||
this.contextSize = contextSize;
|
this.contextSize = contextSize;
|
||||||
this.downsampleCoverage = downsampleCoverage;
|
this.downsampleCoverage = downsampleCoverage;
|
||||||
this.minMappingQuality = minMappingQuality;
|
this.minMappingQuality = minMappingQuality;
|
||||||
|
|
@ -46,15 +47,11 @@ public class SingleSampleCompressor implements Compressor {
|
||||||
this.downsampleStrategy = downsampleStrategy;
|
this.downsampleStrategy = downsampleStrategy;
|
||||||
this.nContigs = nContigs;
|
this.nContigs = nContigs;
|
||||||
this.allowPolyploidReduction = allowPolyploidReduction;
|
this.allowPolyploidReduction = allowPolyploidReduction;
|
||||||
this.compressionStash = compressionStash;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
public Pair<Set<GATKSAMRecord>, CompressionStash> addAlignment( GATKSAMRecord read ) {
|
||||||
* @{inheritDoc}
|
Set<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||||
*/
|
CompressionStash stash = new CompressionStash();
|
||||||
@Override
|
|
||||||
public Iterable<GATKSAMRecord> addAlignment( GATKSAMRecord read ) {
|
|
||||||
TreeSet<GATKSAMRecord> result = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
|
||||||
int readOriginalStart = read.getUnclippedStart();
|
int readOriginalStart = read.getUnclippedStart();
|
||||||
|
|
||||||
// create a new window if:
|
// create a new window if:
|
||||||
|
|
@ -63,22 +60,27 @@ public class SingleSampleCompressor implements Compressor {
|
||||||
(readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window
|
(readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window
|
||||||
|
|
||||||
// close the current sliding window
|
// close the current sliding window
|
||||||
result.addAll(slidingWindow.close());
|
Pair<Set<GATKSAMRecord>, CompressionStash> readsAndStash = slidingWindow.close();
|
||||||
|
reads = readsAndStash.getFirst();
|
||||||
|
stash = readsAndStash.getSecond();
|
||||||
slidingWindow = null; // so we create a new one on the next if
|
slidingWindow = null; // so we create a new one on the next if
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( slidingWindow == null) { // this is the first read
|
if ( slidingWindow == null) { // this is the first read
|
||||||
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction, compressionStash);
|
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction);
|
||||||
slidingWindowCounter++;
|
slidingWindowCounter++;
|
||||||
}
|
}
|
||||||
|
|
||||||
result.addAll(slidingWindow.addRead(read));
|
stash.addAll(slidingWindow.addRead(read));
|
||||||
return result;
|
return new Pair<Set<GATKSAMRecord>, CompressionStash>(reads, stash);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
public Pair<Set<GATKSAMRecord>, CompressionStash> close() {
|
||||||
public Iterable<GATKSAMRecord> close() {
|
return (slidingWindow != null) ? slidingWindow.close() : emptyPair;
|
||||||
return (slidingWindow != null) ? slidingWindow.close() : new TreeSet<GATKSAMRecord>();
|
}
|
||||||
|
|
||||||
|
public Set<GATKSAMRecord> closeVariantRegions(CompressionStash regions) {
|
||||||
|
return slidingWindow.closeVariantRegions(regions);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -6,8 +6,10 @@ import net.sf.samtools.CigarElement;
|
||||||
import net.sf.samtools.CigarOperator;
|
import net.sf.samtools.CigarOperator;
|
||||||
import net.sf.samtools.SAMFileHeader;
|
import net.sf.samtools.SAMFileHeader;
|
||||||
import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler;
|
import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler;
|
||||||
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.recalibration.EventType;
|
import org.broadinstitute.sting.utils.recalibration.EventType;
|
||||||
|
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||||
|
|
@ -55,7 +57,8 @@ public class SlidingWindow {
|
||||||
private final int nContigs;
|
private final int nContigs;
|
||||||
|
|
||||||
private boolean allowPolyploidReductionInGeneral;
|
private boolean allowPolyploidReductionInGeneral;
|
||||||
private CompressionStash compressionStash;
|
|
||||||
|
private static CompressionStash emptyRegions = new CompressionStash();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The types of synthetic reads to use in the finalizeAndAdd method
|
* The types of synthetic reads to use in the finalizeAndAdd method
|
||||||
|
|
@ -87,7 +90,7 @@ public class SlidingWindow {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction, CompressionStash compressionStash) {
|
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction) {
|
||||||
this.contextSize = contextSize;
|
this.contextSize = contextSize;
|
||||||
this.downsampleCoverage = downsampleCoverage;
|
this.downsampleCoverage = downsampleCoverage;
|
||||||
|
|
||||||
|
|
@ -124,7 +127,6 @@ public class SlidingWindow {
|
||||||
this.nContigs = nContigs;
|
this.nContigs = nContigs;
|
||||||
|
|
||||||
this.allowPolyploidReductionInGeneral = allowPolyploidReduction;
|
this.allowPolyploidReductionInGeneral = allowPolyploidReduction;
|
||||||
this.compressionStash = compressionStash;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -138,7 +140,7 @@ public class SlidingWindow {
|
||||||
* @param read the read
|
* @param read the read
|
||||||
* @return a list of reads that have been finished by sliding the window.
|
* @return a list of reads that have been finished by sliding the window.
|
||||||
*/
|
*/
|
||||||
public List<GATKSAMRecord> addRead(GATKSAMRecord read) {
|
public CompressionStash addRead(GATKSAMRecord read) {
|
||||||
addToHeader(windowHeader, read); // update the window header counts
|
addToHeader(windowHeader, read); // update the window header counts
|
||||||
readsInWindow.add(read); // add read to sliding reads
|
readsInWindow.add(read); // add read to sliding reads
|
||||||
return slideWindow(read.getUnclippedStart());
|
return slideWindow(read.getUnclippedStart());
|
||||||
|
|
@ -152,8 +154,9 @@ public class SlidingWindow {
|
||||||
* @param variantSite boolean array with true marking variant regions
|
* @param variantSite boolean array with true marking variant regions
|
||||||
* @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region.
|
* @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region.
|
||||||
*/
|
*/
|
||||||
private SimpleGenomeLoc getNextVariantRegion(int from, int to, boolean[] variantSite) {
|
private SimpleGenomeLoc findNextVariantRegion(int from, int to, boolean[] variantSite, boolean forceClose) {
|
||||||
boolean foundStart = false;
|
boolean foundStart = false;
|
||||||
|
final int windowHeaderStart = getStartLocation(windowHeader);
|
||||||
int variantRegionStartIndex = 0;
|
int variantRegionStartIndex = 0;
|
||||||
for (int i=from; i<to; i++) {
|
for (int i=from; i<to; i++) {
|
||||||
if (variantSite[i] && !foundStart) {
|
if (variantSite[i] && !foundStart) {
|
||||||
|
|
@ -161,10 +164,12 @@ public class SlidingWindow {
|
||||||
foundStart = true;
|
foundStart = true;
|
||||||
}
|
}
|
||||||
else if(!variantSite[i] && foundStart) {
|
else if(!variantSite[i] && foundStart) {
|
||||||
return(new SimpleGenomeLoc(contig, contigIndex, variantRegionStartIndex, i-1, true));
|
return(new SimpleGenomeLoc(contig, contigIndex, windowHeaderStart + variantRegionStartIndex, windowHeaderStart + i - 1, true));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return (foundStart) ? new SimpleGenomeLoc(contig, contigIndex, variantRegionStartIndex, to-1, false) : null;
|
final int refStart = windowHeaderStart + variantRegionStartIndex;
|
||||||
|
final int refStop = windowHeaderStart + to - 1;
|
||||||
|
return (foundStart && forceClose) ? new SimpleGenomeLoc(contig, contigIndex, refStart, refStop, true) : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -173,19 +178,20 @@ public class SlidingWindow {
|
||||||
* @param from beginning window header index of the search window (inclusive)
|
* @param from beginning window header index of the search window (inclusive)
|
||||||
* @param to end window header index of the search window (exclusive)
|
* @param to end window header index of the search window (exclusive)
|
||||||
* @param variantSite boolean array with true marking variant regions
|
* @param variantSite boolean array with true marking variant regions
|
||||||
* @return a list with start/stops of variant regions following getNextVariantRegion description
|
* @return a list with start/stops of variant regions following findNextVariantRegion description
|
||||||
*/
|
*/
|
||||||
private CompressionStash getVariantRegionsFromThisSample(int from, int to, boolean[] variantSite) {
|
private CompressionStash findVariantRegions(int from, int to, boolean[] variantSite, boolean forceClose) {
|
||||||
CompressionStash regions = new CompressionStash();
|
CompressionStash regions = new CompressionStash();
|
||||||
int index = from;
|
int index = from;
|
||||||
while(index < to) {
|
while(index < to) {
|
||||||
SimpleGenomeLoc result = getNextVariantRegion(index, to, variantSite);
|
SimpleGenomeLoc result = findNextVariantRegion(index, to, variantSite, forceClose);
|
||||||
if (result == null)
|
if (result == null)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
regions.add(result);
|
regions.add(result);
|
||||||
if (result.getStop() < 0)
|
if (!result.isFinished())
|
||||||
break;
|
break;
|
||||||
|
|
||||||
index = result.getStop() + 1;
|
index = result.getStop() + 1;
|
||||||
}
|
}
|
||||||
return regions;
|
return regions;
|
||||||
|
|
@ -201,25 +207,25 @@ public class SlidingWindow {
|
||||||
* @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start!
|
* @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start!
|
||||||
* @return all reads that have fallen to the left of the sliding window after the slide
|
* @return all reads that have fallen to the left of the sliding window after the slide
|
||||||
*/
|
*/
|
||||||
protected List<GATKSAMRecord> slideWindow(final int incomingReadUnclippedStart) {
|
protected CompressionStash slideWindow(final int incomingReadUnclippedStart) {
|
||||||
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
|
|
||||||
|
|
||||||
final int windowHeaderStartLocation = getStartLocation(windowHeader);
|
final int windowHeaderStartLocation = getStartLocation(windowHeader);
|
||||||
|
CompressionStash regions = emptyRegions;
|
||||||
|
boolean forceClose = true;
|
||||||
|
|
||||||
if (incomingReadUnclippedStart - contextSize > windowHeaderStartLocation) {
|
if (incomingReadUnclippedStart - contextSize > windowHeaderStartLocation) {
|
||||||
markSites(incomingReadUnclippedStart);
|
markSites(incomingReadUnclippedStart);
|
||||||
int readStartHeaderIndex = incomingReadUnclippedStart - windowHeaderStartLocation;
|
int readStartHeaderIndex = incomingReadUnclippedStart - windowHeaderStartLocation;
|
||||||
int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive)
|
int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive)
|
||||||
|
|
||||||
CompressionStash regions = getVariantRegionsFromThisSample(0, breakpoint, markedSites.getVariantSiteBitSet());
|
regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose);
|
||||||
finalizedReads = closeVariantRegions(regions, false);
|
|
||||||
|
|
||||||
while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) {
|
|
||||||
readsInWindow.pollFirst();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return finalizedReads;
|
// todo -- can be more aggressive here removing until the NEW window header start location after closing the variant regions
|
||||||
|
while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) {
|
||||||
|
readsInWindow.pollFirst();
|
||||||
|
}
|
||||||
|
|
||||||
|
return regions;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -623,31 +629,27 @@ public class SlidingWindow {
|
||||||
result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
|
result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
|
||||||
result.addAll(finalizeAndAdd(ConsensusType.BOTH));
|
result.addAll(finalizeAndAdd(ConsensusType.BOTH));
|
||||||
|
|
||||||
return result; // finalized reads will be downsampled if necessary
|
return result; // finalized reads will be downsampled if necessary
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Set<GATKSAMRecord> closeVariantRegions(CompressionStash regions) {
|
||||||
private List<GATKSAMRecord> closeVariantRegions(CompressionStash regions, boolean forceClose) {
|
TreeSet<GATKSAMRecord> allReads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||||
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
|
|
||||||
if (!regions.isEmpty()) {
|
if (!regions.isEmpty()) {
|
||||||
int lastStop = -1;
|
int lastStop = -1;
|
||||||
|
int windowHeaderStart = getStartLocation(windowHeader);
|
||||||
|
|
||||||
for (SimpleGenomeLoc region : regions) {
|
for (SimpleGenomeLoc region : regions) {
|
||||||
int start = region.getStart();
|
if (region.isFinished() && region.getContig() == contig && region.getStart() >= windowHeaderStart && region.getStop() <= windowHeaderStart + windowHeader.size()) {
|
||||||
int stop = region.getStop();
|
int start = region.getStart() - windowHeaderStart;
|
||||||
|
int stop = region.getStop() - windowHeaderStart;
|
||||||
|
|
||||||
if (!region.isFinished()) {
|
allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); // todo -- add condition here dependent on dbSNP track
|
||||||
if(forceClose) // region is unfinished but we're forcing the close of this window
|
lastStop = stop;
|
||||||
stop = windowHeader.size() - 1;
|
|
||||||
else
|
|
||||||
continue; // region is unfinished and we're not forcing the close of this window
|
|
||||||
}
|
}
|
||||||
|
|
||||||
allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1));
|
|
||||||
lastStop = stop;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion)
|
for (int i = 0; i <= lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion)
|
||||||
windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here!
|
windowHeader.remove();
|
||||||
}
|
}
|
||||||
return allReads;
|
return allReads;
|
||||||
}
|
}
|
||||||
|
|
@ -681,23 +683,24 @@ public class SlidingWindow {
|
||||||
*
|
*
|
||||||
* @return All reads generated
|
* @return All reads generated
|
||||||
*/
|
*/
|
||||||
public List<GATKSAMRecord> close() {
|
public Pair<Set<GATKSAMRecord>, CompressionStash> close() {
|
||||||
// mark variant regions
|
// mark variant regions
|
||||||
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
|
Set<GATKSAMRecord> finalizedReads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||||
|
CompressionStash regions = new CompressionStash();
|
||||||
|
boolean forceCloseUnfinishedRegions = true;
|
||||||
|
|
||||||
if (!windowHeader.isEmpty()) {
|
if (!windowHeader.isEmpty()) {
|
||||||
markSites(getStopLocation(windowHeader) + 1);
|
markSites(getStopLocation(windowHeader) + 1);
|
||||||
CompressionStash regions = getVariantRegionsFromThisSample(0, windowHeader.size(), markedSites.getVariantSiteBitSet());
|
regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), forceCloseUnfinishedRegions);
|
||||||
finalizedReads = closeVariantRegions(regions, true);
|
finalizedReads = closeVariantRegions(regions);
|
||||||
|
|
||||||
if (!windowHeader.isEmpty()) {
|
if (!windowHeader.isEmpty()) {
|
||||||
finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false));
|
finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false));
|
||||||
finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up
|
finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return finalizedReads;
|
return new Pair<Set<GATKSAMRecord>, CompressionStash>(finalizedReads, regions);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -26,23 +26,23 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
@Test(enabled = true)
|
@Test(enabled = true)
|
||||||
public void testDefaultCompression() {
|
public void testDefaultCompression() {
|
||||||
RRTest("testDefaultCompression ", L, "46ea88e32bae3072f5cd68a0db4b55f1");
|
RRTest("testDefaultCompression ", L, "98080d3c53f441564796fc143cf510da");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(enabled = true)
|
@Test(enabled = true)
|
||||||
public void testMultipleIntervals() {
|
public void testMultipleIntervals() {
|
||||||
String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
|
String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
|
||||||
RRTest("testMultipleIntervals ", intervals, "c3784a0b42f5456b705f9b152a4b697a");
|
RRTest("testMultipleIntervals ", intervals, "c5dcdf4edf368b5b897d66f76034d9f0");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(enabled = true)
|
@Test(enabled = true)
|
||||||
public void testHighCompression() {
|
public void testHighCompression() {
|
||||||
RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "e385eb0ae5768f8507671d5303a212d5");
|
RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "27cb99e87eda5e46187e56f50dd37f26");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(enabled = true)
|
@Test(enabled = true)
|
||||||
public void testLowCompression() {
|
public void testLowCompression() {
|
||||||
RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "6b5546be9363e493b9838542f5dc8cae");
|
RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "4e7f111688d49973c35669855b7a2eaf");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(enabled = true)
|
@Test(enabled = true)
|
||||||
|
|
@ -83,7 +83,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
||||||
@Test(enabled = true)
|
@Test(enabled = true)
|
||||||
public void testCoReduction() {
|
public void testCoReduction() {
|
||||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s ";
|
String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s ";
|
||||||
executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("")));
|
executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("5c30fde961a1357bf72c15144c01981b")));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,10 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||||
|
|
||||||
|
import com.google.java.contract.Requires;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
|
import java.util.SortedSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* GenomeLocs are very useful objects to keep track of genomic locations and perform set operations
|
* GenomeLocs are very useful objects to keep track of genomic locations and perform set operations
|
||||||
|
|
@ -27,4 +31,43 @@ public class SimpleGenomeLoc extends GenomeLoc {
|
||||||
public boolean isFinished() {
|
public boolean isFinished() {
|
||||||
return finished;
|
return finished;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Requires("a != null && b != null")
|
||||||
|
public static SimpleGenomeLoc merge(SimpleGenomeLoc a, SimpleGenomeLoc b) throws ReviewedStingException {
|
||||||
|
if(GenomeLoc.isUnmapped(a) || GenomeLoc.isUnmapped(b)) {
|
||||||
|
throw new ReviewedStingException("Tried to merge unmapped genome locs");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!(a.contiguousP(b))) {
|
||||||
|
throw new ReviewedStingException("The two genome locs need to be contiguous");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
return new SimpleGenomeLoc(a.getContig(), a.contigIndex,
|
||||||
|
Math.min(a.getStart(), b.getStart()),
|
||||||
|
Math.max(a.getStop(), b.getStop()),
|
||||||
|
a.isFinished());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Merges a list of *sorted* *contiguous* locs into one
|
||||||
|
*
|
||||||
|
* @param sortedLocs a sorted list of contiguous locs
|
||||||
|
* @return one merged loc
|
||||||
|
*/
|
||||||
|
public static SimpleGenomeLoc merge(SortedSet<SimpleGenomeLoc> sortedLocs) {
|
||||||
|
SimpleGenomeLoc previousLoc = null;
|
||||||
|
for (SimpleGenomeLoc loc : sortedLocs) {
|
||||||
|
if (loc.isUnmapped()) {
|
||||||
|
throw new ReviewedStingException("Tried to merge unmapped genome locs");
|
||||||
|
}
|
||||||
|
if (previousLoc != null && !previousLoc.contiguousP(loc)) {
|
||||||
|
throw new ReviewedStingException("The genome locs need to be contiguous");
|
||||||
|
}
|
||||||
|
previousLoc = loc;
|
||||||
|
}
|
||||||
|
SimpleGenomeLoc firstLoc = sortedLocs.first();
|
||||||
|
SimpleGenomeLoc lastLoc = sortedLocs.last();
|
||||||
|
return merge(firstLoc, lastLoc);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue