More RR-related updates and tests.

- ReduceReads by default now sets up-front ReadWalker downsampling to 40x per start position.
   - This is the value I used in my tests with Picard to show that memory issues pretty much disappeared.
   - This should hopefully take care of the memory issues being reported on the forum.
- Added javadocs to SlidingWindow (the main RR class) to follow GATK conventions.
- Added more unit tests to increase coverage of BaseCounts class.
- Added more unit tests to test I/D operators in the SlidingWindow class.
This commit is contained in:
Eric Banks 2013-02-04 12:57:43 -05:00
parent 971ded341b
commit 2d518f3063
4 changed files with 167 additions and 59 deletions

View File

@ -56,13 +56,11 @@ import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.filters.*;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.PartitionBy;
import org.broadinstitute.sting.gatk.walkers.PartitionType;
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
@ -107,6 +105,7 @@ import java.util.*;
@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
@PartitionBy(PartitionType.CONTIG)
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class})
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=40)
public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceReadsStash> {
@Output

View File

@ -198,8 +198,10 @@ public class SlidingWindow {
* sliding process.
*
* @param read the read
* @return a list of reads that have been finished by sliding the window.
* @return a non-null list of reads (in the CompressionStash) that have been finished by sliding the window.
*/
@Requires({"read != null"})
@Ensures("result != null")
public CompressionStash addRead(GATKSAMRecord read) {
addToHeader(windowHeader, read); // update the window header counts
readsInWindow.add(read); // add read to sliding reads
@ -210,8 +212,8 @@ public class SlidingWindow {
* Returns the next complete (or incomplete if closeLastRegion is true) variant region between 'from' (inclusive) and 'to' (exclusive)
* but converted to global coordinates.
*
* @param from beginning window header index of the search window (inclusive); note that this uses local coordinates
* @param to end window header index of the search window (exclusive); note that this uses local coordinates
* @param from beginning window header index of the search window (inclusive) in local (to the windowHeader) coordinates
* @param to end window header index of the search window (exclusive) in local (to the windowHeader) coordinates
* @param variantSite boolean array with true marking variant regions
* @param closeLastRegion if the last index is variant (so it's an incomplete region), should we close (and return as an interval) the location or ignore it?
* @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region. All coordinates returned are global.
@ -238,8 +240,8 @@ public class SlidingWindow {
/**
* Creates a list with all the complete and incomplete variant regions within 'from' (inclusive) and 'to' (exclusive)
*
* @param from beginning window header index of the search window (inclusive); note that this uses local coordinates
* @param to end window header index of the search window (exclusive); note that this uses local coordinates
* @param from beginning window header index of the search window (inclusive) in local (to the windowHeader) coordinates
* @param to end window header index of the search window (exclusive) in local (to the windowHeader) coordinates
* @param variantSite boolean array with true marking variant regions
* @return a list with start/stops of variant regions following findNextVariantRegion description in global coordinates
*/
@ -395,10 +397,14 @@ public class SlidingWindow {
*
* If adding a sequence with gaps, it will finalize multiple consensus reads and keep the last running consensus
*
* @param start the first header index to add to consensus
* @param end the first header index NOT TO add to consensus
* @return a list of consensus reads generated by this call. Empty list if no consensus was generated.
* @param header the window header
* @param start the first header index to add to consensus
* @param end the first header index NOT TO add to consensus
* @param isNegativeStrand should the synthetic read be represented as being on the negative strand?
* @return a non-null list of consensus reads generated by this call. Empty list if no consensus was generated.
*/
@Requires({"start >= 0 && (end >= start || end == 0)"})
@Ensures("result != null")
protected List<GATKSAMRecord> addToSyntheticReads(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
LinkedList<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
if (start < end) {
@ -450,7 +456,7 @@ public class SlidingWindow {
* Finalizes one or more synthetic reads.
*
* @param type the synthetic reads you want to close
* @return the GATKSAMRecords generated by finalizing the synthetic reads
* @return a possibly null list of GATKSAMRecords generated by finalizing the synthetic reads
*/
private List<GATKSAMRecord> finalizeAndAdd(ConsensusType type) {
GATKSAMRecord read = null;
@ -479,7 +485,7 @@ public class SlidingWindow {
*
* @param start beginning of the filtered region
* @param upTo limit to search for another consensus element
* @return next position with consensus data or empty
* @return next position in local coordinates (relative to the windowHeader) with consensus data; otherwise, the start position
*/
private int findNextNonConsensusElement(LinkedList<HeaderElement> header, int start, int upTo) {
Iterator<HeaderElement> headerElementIterator = header.listIterator(start);
@ -501,7 +507,7 @@ public class SlidingWindow {
*
* @param start beginning of the region
* @param upTo limit to search for
* @return next position with no filtered data
* @return next position in local coordinates (relative to the windowHeader) with no filtered data; otherwise, the start position
*/
private int findNextNonFilteredDataElement(LinkedList<HeaderElement> header, int start, int upTo) {
Iterator<HeaderElement> headerElementIterator = header.listIterator(start);
@ -523,7 +529,7 @@ public class SlidingWindow {
*
* @param start beginning of the region
* @param upTo limit to search for
* @return next position with non-empty element
* @return next position in local coordinates (relative to the windowHeader) with non-empty element; otherwise, the start position
*/
private int findNextNonEmptyElement(LinkedList<HeaderElement> header, int start, int upTo) {
ListIterator<HeaderElement> headerElementIterator = header.listIterator(start);
@ -544,12 +550,16 @@ public class SlidingWindow {
/**
* Adds bases to the filtered data synthetic read.
*
* Different from the addToConsensus method, this method assumes a contiguous sequence of filteredData
* bases.
* Different from the addToConsensus method, this method assumes a contiguous sequence of filteredData bases.
*
* @param start the first header index to add to consensus
* @param end the first header index NOT TO add to consensus
* @param header the window header
* @param start the first header index to add to consensus
* @param end the first header index NOT TO add to consensus
* @param isNegativeStrand should the synthetic read be represented as being on the negative strand?
* @return a non-null list of GATKSAMRecords representing finalized filtered consensus data. Empty list if no consensus was generated.
*/
@Requires({"start >= 0 && (end >= start || end == 0)"})
@Ensures("result != null")
private List<GATKSAMRecord> addToFilteredData(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
List<GATKSAMRecord> result = new ArrayList<GATKSAMRecord>(0);
@ -585,9 +595,12 @@ public class SlidingWindow {
* Different from the addToConsensus method, this method assumes a contiguous sequence of filteredData
* bases.
*
* @param header the window header
* @param start the first header index to add to consensus
* @param end the first header index NOT TO add to consensus
* @param isNegativeStrand should the synthetic read be represented as being on the negative strand?
*/
@Requires({"start >= 0 && (end >= start || end == 0)"})
private void addToRunningConsensus(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
if (runningConsensus == null)
runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
@ -621,6 +634,16 @@ public class SlidingWindow {
syntheticRead.add(base, count, qual, insQual, delQual, rms);
}
/**
* Method to compress a variant region and return the associated reduced reads
*
* @param start the first window header index in the variant region (inclusive)
* @param stop the last window header index of the variant region (inclusive)
* @param disallowPolyploidReductionAtThisPosition should we disallow polyploid (het) compression here?
* @return a non-null list of all reads contained in the variant region
*/
@Requires({"start >= 0 && (stop >= start || stop == 0)"})
@Ensures("result != null")
private List<GATKSAMRecord> compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
@ -684,11 +707,13 @@ public class SlidingWindow {
/**
* Finalizes a variant region, any adjacent synthetic reads.
*
* @param start the first window header index in the variant region (inclusive)
* @param stop the last window header index of the variant region (inclusive)
* @return all reads contained in the variant region plus any adjacent synthetic reads
* @param start the first window header index in the variant region (inclusive)
* @param stop the last window header index of the variant region (inclusive)
* @param disallowPolyploidReductionAtThisPosition should we disallow polyploid (het) compression here?
* @return a non-null list of all reads contained in the variant region plus any adjacent synthetic reads
*/
@Requires("start <= stop")
@Requires({"start >= 0 && (stop >= start || stop == 0)"})
@Ensures("result != null")
protected List<GATKSAMRecord> closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
List<GATKSAMRecord> allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition);
@ -733,9 +758,11 @@ public class SlidingWindow {
*
* It will use the downsampling strategy defined by the SlidingWindow
*
* @param allReads the reads to select from (all reads that cover the window)
* @return a list of reads selected by the downsampler to cover the window to at least the desired coverage
* @param allReads a non-null list of reads to select from (all reads that cover the window)
* @return a non-null list of reads selected by the downsampler to cover the window to at least the desired coverage
*/
@Requires({"allReads != null"})
@Ensures("result != null")
protected List<GATKSAMRecord> downsampleVariantRegion(final List<GATKSAMRecord> allReads) {
int nReads = allReads.size();
if (nReads == 0)
@ -755,8 +782,9 @@ public class SlidingWindow {
* regions that still exist regardless of being able to fulfill the
* context size requirement in the end.
*
* @return All reads generated
* @return A non-null set/list of all reads generated
*/
@Ensures("result != null")
public Pair<Set<GATKSAMRecord>, CompressionStash> close() {
// mark variant regions
Set<GATKSAMRecord> finalizedReads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
@ -780,7 +808,7 @@ public class SlidingWindow {
/**
* generates the SAM record for the running consensus read and resets it (to null)
*
* @return the read contained in the running consensus
* @return the read contained in the running consensus or null
*/
protected GATKSAMRecord finalizeRunningConsensus() {
GATKSAMRecord finalizedRead = null;
@ -798,7 +826,7 @@ public class SlidingWindow {
/**
* generates the SAM record for the filtered data consensus and resets it (to null)
*
* @return the read contained in the running consensus
* @return the read contained in the running consensus or null
*/
protected GATKSAMRecord finalizeFilteredDataConsensus() {
GATKSAMRecord finalizedRead = null;
@ -813,9 +841,19 @@ public class SlidingWindow {
return finalizedRead;
}
private List<GATKSAMRecord> createPolyploidConsensus(int start, int stop, int nHaplotypes, int hetRefPosition) {
/**
* Finalizes a variant region, any adjacent synthetic reads.
*
* @param start the first window header index in the variant region (inclusive)
* @param stop the last window header index of the variant region (inclusive)
* @param nHaplotypes the number of haplotypes to use
* @param hetRefPosition reference position (in global coordinates) of the het site
* @return a non-null list of all reads contained in the variant region as a polyploid consensus
*/
// TODO -- Why do we need the nHaplotypes argument? It is not enforced at all... [EB]
@Requires({"start >= 0 && (stop >= start || stop == 0)"})
@Ensures("result != null")
private List<GATKSAMRecord> createPolyploidConsensus(final int start, final int stop, final int nHaplotypes, final int hetRefPosition) {
// we will create two (positive strand, negative strand) headers for each contig
List<LinkedList<HeaderElement>> headersPosStrand = new ArrayList<LinkedList<HeaderElement>>();
List<LinkedList<HeaderElement>> headersNegStrand = new ArrayList<LinkedList<HeaderElement>>();
@ -902,7 +940,7 @@ public class SlidingWindow {
* @param read the incoming read to be added to the sliding window
* @param removeRead if we are removing the read from the header or adding
*/
private void updateHeaderCounts(LinkedList<HeaderElement> header, GATKSAMRecord read, boolean removeRead) {
private void updateHeaderCounts(final LinkedList<HeaderElement> header, final GATKSAMRecord read, final boolean removeRead) {
byte[] bases = read.getReadBases();
byte[] quals = read.getBaseQualities();
byte[] insQuals = read.getExistingBaseInsertionQualities();

View File

@ -47,9 +47,6 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
// the imports for unit testing.
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
@ -62,12 +59,12 @@ import java.util.List;
* Basic unit test for BaseCounts in reduced reads
*/
public class BaseCountsUnitTest extends BaseTest {
private class SingleTest {
private class BaseCountsTest {
public String bases;
public byte mostCountBase;
public int mostCommonCount;
private SingleTest(String bases, char mostCountBase, int mostCommonCount) {
private BaseCountsTest(String bases, char mostCountBase, int mostCommonCount) {
this.mostCommonCount = mostCommonCount;
this.mostCountBase = (byte)mostCountBase;
this.bases = bases;
@ -77,30 +74,28 @@ public class BaseCountsUnitTest extends BaseTest {
@DataProvider(name = "data")
public Object[][] createData1() {
List<SingleTest> params = new ArrayList<SingleTest>();
List<BaseCountsTest> params = new ArrayList<BaseCountsTest>();
params.add(new SingleTest("A", 'A', 1 ));
params.add(new SingleTest("AA", 'A', 2 ));
params.add(new SingleTest("AC", 'A', 1 ));
params.add(new SingleTest("AAC", 'A', 2 ));
params.add(new SingleTest("AAA", 'A', 3 ));
params.add(new SingleTest("AAAN", 'A', 3 ));
params.add(new SingleTest("AAANNNN", 'N', 4 ));
params.add(new SingleTest("AACTG", 'A', 2 ));
params.add(new SingleTest("D", 'D', 1 ));
params.add(new SingleTest("DDAAD", 'D', 3));
params.add(new SingleTest("", (char)BaseCounts.MAX_BASE_WITH_NO_COUNTS, 0 ));
params.add(new SingleTest("AAIIIAI", 'I', 4 ));
params.add(new BaseCountsTest("A", 'A', 1 ));
params.add(new BaseCountsTest("AA", 'A', 2 ));
params.add(new BaseCountsTest("AC", 'A', 1 ));
params.add(new BaseCountsTest("AAC", 'A', 2 ));
params.add(new BaseCountsTest("AAA", 'A', 3 ));
params.add(new BaseCountsTest("AAAN", 'A', 3 ));
params.add(new BaseCountsTest("AAANNNN", 'N', 4 ));
params.add(new BaseCountsTest("AACTG", 'A', 2 ));
params.add(new BaseCountsTest("D", 'D', 1 ));
params.add(new BaseCountsTest("DDAAD", 'D', 3));
params.add(new BaseCountsTest("", (char)BaseCounts.MAX_BASE_WITH_NO_COUNTS, 0 ));
params.add(new BaseCountsTest("AAIIIAI", 'I', 4 ));
List<Object[]> params2 = new ArrayList<Object[]>();
for ( SingleTest x : params ) params2.add(new Object[]{x});
for ( BaseCountsTest x : params ) params2.add(new Object[]{x});
return params2.toArray(new Object[][]{});
}
@Test(dataProvider = "data", enabled = true)
public void testCounting(SingleTest params) {
public void testCounting(BaseCountsTest params) {
BaseCounts counts = new BaseCounts();
for ( byte base : params.bases.getBytes() )
@ -110,5 +105,44 @@ public class BaseCountsUnitTest extends BaseTest {
Assert.assertEquals(counts.totalCount(), params.bases.length(), name);
Assert.assertEquals(counts.countOfBase(counts.baseIndexWithMostCounts()), params.mostCommonCount, name);
Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name);
// test the static creation
final int[] countsArray = new int[] { counts.countOfBase(BaseIndex.A), counts.countOfBase(BaseIndex.C),
counts.countOfBase(BaseIndex.G), counts.countOfBase(BaseIndex.T)};
final BaseCounts countsFromArray = BaseCounts.createWithCounts(countsArray);
Assert.assertEquals(counts.countOfBase(BaseIndex.A), countsFromArray.countOfBase(BaseIndex.A));
Assert.assertEquals(counts.countOfBase(BaseIndex.C), countsFromArray.countOfBase(BaseIndex.C));
Assert.assertEquals(counts.countOfBase(BaseIndex.G), countsFromArray.countOfBase(BaseIndex.G));
Assert.assertEquals(counts.countOfBase(BaseIndex.T), countsFromArray.countOfBase(BaseIndex.T));
Assert.assertEquals(ACGTcounts(counts), countsFromArray.totalCount());
// test addition
counts.add(countsFromArray);
Assert.assertEquals(counts.countOfBase(BaseIndex.A), 2 * countsFromArray.countOfBase(BaseIndex.A));
Assert.assertEquals(counts.countOfBase(BaseIndex.C), 2 * countsFromArray.countOfBase(BaseIndex.C));
Assert.assertEquals(counts.countOfBase(BaseIndex.G), 2 * countsFromArray.countOfBase(BaseIndex.G));
Assert.assertEquals(counts.countOfBase(BaseIndex.T), 2 * countsFromArray.countOfBase(BaseIndex.T));
Assert.assertEquals(ACGTcounts(counts), 2 * countsFromArray.totalCount());
// test subtraction
counts.sub(countsFromArray);
Assert.assertEquals(counts.countOfBase(BaseIndex.A), countsFromArray.countOfBase(BaseIndex.A));
Assert.assertEquals(counts.countOfBase(BaseIndex.C), countsFromArray.countOfBase(BaseIndex.C));
Assert.assertEquals(counts.countOfBase(BaseIndex.G), countsFromArray.countOfBase(BaseIndex.G));
Assert.assertEquals(counts.countOfBase(BaseIndex.T), countsFromArray.countOfBase(BaseIndex.T));
Assert.assertEquals(ACGTcounts(counts), countsFromArray.totalCount());
// test decrementing
if ( counts.countOfBase(BaseIndex.A) > 0 ) {
counts.decr((byte)'A');
Assert.assertEquals(counts.countOfBase(BaseIndex.A), countsFromArray.countOfBase(BaseIndex.A) - 1);
}
}
private static int ACGTcounts(final BaseCounts baseCounts) {
return baseCounts.totalCountWithoutIndels() - baseCounts.countOfBase(BaseIndex.N);
}
}

View File

@ -47,6 +47,9 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.BaseTest;
@ -258,24 +261,48 @@ public class SlidingWindowUnitTest extends BaseTest {
// then add the permuted reads
for ( final GenomeLoc loc : locs )
myReads.add(createVariantRead(loc, readsShouldBeLowQuality, variantBaseShouldBeLowQuality));
myReads.add(createVariantRead(loc, readsShouldBeLowQuality, variantBaseShouldBeLowQuality, CigarOperator.M));
}
private GATKSAMRecord createVariantRead(final GenomeLoc loc, final boolean readShouldBeLowQuality, final boolean variantBaseShouldBeLowQuality) {
private ConsensusCreationTest(final List<GenomeLoc> locs, final CigarOperator operator, final int expectedNumberOfReads) {
this.expectedNumberOfReads = expectedNumberOfReads;
// first, add the basic reads to the collection
myReads.addAll(basicReads);
// then add the permuted reads
for ( final GenomeLoc loc : locs )
myReads.add(createVariantRead(loc, false, false, operator));
}
private GATKSAMRecord createVariantRead(final GenomeLoc loc, final boolean readShouldBeLowQuality,
final boolean variantBaseShouldBeLowQuality, final CigarOperator operator) {
final int startPos = loc.getStart() - 50;
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead" + startPos, 0, startPos, readLength);
final byte[] bases = Utils.dupBytes((byte) 'A', readLength);
// create a mismatch
bases[50] = 'C';
// create a mismatch if requested
if ( operator == CigarOperator.M )
bases[50] = 'C';
read.setReadBases(bases);
final byte[] baseQuals = Utils.dupBytes((byte) 30, readLength);
if ( variantBaseShouldBeLowQuality )
baseQuals[50] = (byte)10;
read.setBaseQualities(baseQuals);
final byte mappingQual = readShouldBeLowQuality ? (byte)10 : (byte)30;
read.setMappingQuality(mappingQual);
if ( operator != CigarOperator.M ) {
final List<CigarElement> elements = new ArrayList<CigarElement>(3);
elements.add(new CigarElement(operator == CigarOperator.D ? 50 : 51, CigarOperator.M));
elements.add(new CigarElement(1, operator));
elements.add(new CigarElement(operator == CigarOperator.D ? 50 : 48, CigarOperator.M));
read.setCigar(new Cigar(elements));
}
return read;
}
}
@ -315,6 +342,16 @@ public class SlidingWindowUnitTest extends BaseTest {
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc1100), true, false, 2)});
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc1100), false, true, 3)});
// test I/D operators
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290), CigarOperator.D, 9)});
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc295), CigarOperator.D, 10)});
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc309), CigarOperator.D, 10)});
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc310), CigarOperator.D, 11)});
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290), CigarOperator.I, 9)});
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc295), CigarOperator.I, 10)});
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc309), CigarOperator.I, 10)});
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc310), CigarOperator.I, 11)});
return tests.toArray(new Object[][]{});
}