More RR tests and fixes.

* Fixed implementation of polyploid (het) compression in RR.
  * The test for a usable site was all wrong.  Worked out details with Mauricio to get it right.
  * Added comprehensive unit tests in HeaderElement class to make sure this is done right.
  * Still need to add tests for the actual polyploid compression.
  * No longer allow non-diploid het compression; I don't want to test/handle it, do you?
* Added nearly full coverage of tests for the BaseCounts class.
This commit is contained in:
Eric Banks 2013-02-04 15:55:15 -05:00
parent a281fa6548
commit 70f3997a38
9 changed files with 170 additions and 76 deletions

View File

@ -143,7 +143,7 @@ import com.google.java.contract.Requires;
@Ensures("result >= 0")
public byte averageQuals(final byte base) {
return (byte) (getSumQuals(base) / countOfBase(base));
return averageQuals(BaseIndex.byteToBase(base));
}
@Ensures("result >= 0")
@ -232,12 +232,6 @@ import com.google.java.contract.Requires;
return maxI;
}
/**
 * Determines whether the target base outranks the test base: observation
 * counts are compared first, and summed base qualities break ties.
 *
 * @param targetIndex the base index being promoted
 * @param testIndex   the base index it is compared against
 * @return true if the target has a strictly higher count, or an equal count
 *         with a strictly higher quality sum
 */
private boolean hasHigherCount(final BaseIndex targetIndex, final BaseIndex testIndex) {
    final int countA = counts[targetIndex.index];
    final int countB = counts[testIndex.index];
    if ( countA != countB )
        return countA > countB;
    return sumQuals[targetIndex.index] > sumQuals[testIndex.index];
}
/**
 * Convenience wrapper returning the byte representation of the most probable
 * base, delegating the actual selection to baseIndexWithMostProbability().
 *
 * @return the most probable base as a byte
 */
public byte baseWithMostProbability() {
return baseIndexWithMostProbability().getByte();
}

View File

@ -49,7 +49,6 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
import java.util.LinkedList;
/**
@ -268,24 +267,26 @@ public class HeaderElement {
* Calculates the number of haplotypes necessary to represent this site.
*
* @param minVariantProportion the minimum proportion to call a site variant.
* @return the number of haplotypes necessary to represent this site.
* @return the number of alleles necessary to represent this site.
*/
public int getNumberOfHaplotypes(double minVariantProportion) {
int nHaplotypes = 0;
int totalCount = consensusBaseCounts.totalCount();
int runningCount = 0;
if (totalCount == 0)
public int getNumberOfAlleles(final double minVariantProportion) {
final int totalBaseCount = consensusBaseCounts.totalCount();
if (totalBaseCount == 0)
return 0;
int[] countsArray = consensusBaseCounts.countsArray();
Arrays.sort(countsArray);
for (int i = countsArray.length-1; i>=0; i--) {
nHaplotypes++;
runningCount += countsArray[i];
if (runningCount/totalCount > minVariantProportion)
break;
final int minBaseCountForRelevantAlleles = (int)(minVariantProportion * totalBaseCount);
int nAlleles = 0;
for ( BaseIndex base : BaseIndex.values() ) {
final int baseCount = consensusBaseCounts.countOfBase(base);
// don't consider this allele if the count is 0
if ( baseCount == 0 )
continue;
if ( baseCount >= minBaseCountForRelevantAlleles )
nAlleles++;
}
return nHaplotypes;
return nAlleles;
}
}

View File

@ -101,12 +101,11 @@ public class MultiSampleCompressor {
final double minIndelProportionToTriggerVariant,
final int minBaseQual,
final ReduceReads.DownsampleStrategy downsampleStrategy,
final int nContigs,
final boolean allowPolyploidReduction) {
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
compressorsPerSample.put(name,
new SingleSampleCompressor(contextSize, downsampleCoverage,
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction));
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, allowPolyploidReduction));
}
}

View File

@ -212,14 +212,6 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
@Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false)
private int downsampleCoverage = 250;
/**
* Number of chromosomes in the sample (this is used for the polyploid consensus compression). Only
* tested for humans (or organisms with n=2). Use at your own risk!
*/
@Hidden
@Argument(fullName = "contigs", shortName = "ctg", doc = "", required = false)
private int nContigs = 2;
@Hidden
@Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false)
private boolean nwayout = false;
@ -371,7 +363,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
*/
@Override
public ReduceReadsStash reduceInit() {
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION));
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, USE_POLYPLOID_REDUCTION));
}
/**

View File

@ -67,7 +67,6 @@ public class SingleSampleCompressor {
final private double minIndelProportionToTriggerVariant;
final private int minBaseQual;
final private ReduceReads.DownsampleStrategy downsampleStrategy;
final private int nContigs;
final private boolean allowPolyploidReduction;
private SlidingWindow slidingWindow;
@ -82,7 +81,6 @@ public class SingleSampleCompressor {
final double minIndelProportionToTriggerVariant,
final int minBaseQual,
final ReduceReads.DownsampleStrategy downsampleStrategy,
final int nContigs,
final boolean allowPolyploidReduction) {
this.contextSize = contextSize;
this.downsampleCoverage = downsampleCoverage;
@ -92,7 +90,6 @@ public class SingleSampleCompressor {
this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant;
this.minBaseQual = minBaseQual;
this.downsampleStrategy = downsampleStrategy;
this.nContigs = nContigs;
this.allowPolyploidReduction = allowPolyploidReduction;
}
@ -114,7 +111,7 @@ public class SingleSampleCompressor {
}
if ( slidingWindow == null) { // this is the first read
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction);
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), allowPolyploidReduction);
slidingWindowCounter++;
}

View File

@ -102,8 +102,6 @@ public class SlidingWindow {
protected ReduceReads.DownsampleStrategy downsampleStrategy;
private boolean hasIndelQualities;
private final int nContigs;
private boolean allowPolyploidReductionInGeneral;
private static CompressionStash emptyRegions = new CompressionStash();
@ -143,14 +141,13 @@ public class SlidingWindow {
this.contigIndex = contigIndex;
contextSize = 10;
nContigs = 1;
this.windowHeader = new LinkedList<HeaderElement>();
windowHeader.addFirst(new HeaderElement(startLocation));
this.readsInWindow = new TreeSet<GATKSAMRecord>();
}
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction) {
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, boolean allowPolyploidReduction) {
this.contextSize = contextSize;
this.downsampleCoverage = downsampleCoverage;
@ -184,7 +181,6 @@ public class SlidingWindow {
this.downsampleStrategy = downsampleStrategy;
this.hasIndelQualities = hasIndelQualities;
this.nContigs = nContigs;
this.allowPolyploidReductionInGeneral = allowPolyploidReduction;
}
@ -644,43 +640,43 @@ public class SlidingWindow {
*/
@Requires({"start >= 0 && (stop >= start || stop == 0)"})
@Ensures("result != null")
private List<GATKSAMRecord> compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
protected List<GATKSAMRecord> compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
// Try to compress into a polyploid consensus
int nHaplotypes = 0;
int nVariantPositions = 0;
int hetRefPosition = -1;
boolean canCompress = true;
boolean foundEvent = false;
Object[] header = windowHeader.toArray();
// foundEvent will remain false if we don't allow polyploid reduction
if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition ) {
for (int i = start; i<=stop; i++) {
nHaplotypes = ((HeaderElement) header[i]).getNumberOfHaplotypes(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT);
if (nHaplotypes > nContigs) {
int nAlleles = ((HeaderElement) header[i]).getNumberOfAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT);
// we will only work on diploid cases because we just don't want to handle/test other scenarios
if ( nAlleles > 2 ) {
canCompress = false;
break;
} else if ( nAlleles == 2 ) {
nVariantPositions++;
}
// guarantees that there is only 1 site in the variant region that needs more than one haplotype
if (nHaplotypes > 1) {
if (!foundEvent) {
foundEvent = true;
hetRefPosition = i;
}
else {
canCompress = false;
break;
}
// make sure that there is only 1 site in the variant region that contains more than one allele
if ( nVariantPositions == 1 ) {
hetRefPosition = i;
} else if ( nVariantPositions > 1 ) {
canCompress = false;
break;
}
}
}
// Try to compress the variant region
// the "foundEvent" protects us from trying to compress variant regions that are created by insertions
if (canCompress && foundEvent) {
allReads = createPolyploidConsensus(start, stop, nHaplotypes, ((HeaderElement) header[hetRefPosition]).getLocation());
// Try to compress the variant region; note that using the hetRefPosition protects us from trying to compress
// variant regions that are created by insertions (since we can't confirm here that they represent the same allele)
if ( canCompress && hetRefPosition != -1 ) {
allReads = createPolyploidConsensus(start, stop, ((HeaderElement) header[hetRefPosition]).getLocation());
}
// Return all reads that overlap the variant region and remove them from the window header entirely
@ -846,19 +842,17 @@ public class SlidingWindow {
*
* @param start the first window header index in the variant region (inclusive)
* @param stop the last window header index of the variant region (inclusive)
* @param nHaplotypes the number of haplotypes to use
* @param hetRefPosition reference position (in global coordinates) of the het site
* @return a non-null list of all reads contained in the variant region as a polyploid consensus
*/
// TODO -- Why do we need the nHaplotypes argument? It is not enforced at all... [EB]
@Requires({"start >= 0 && (stop >= start || stop == 0)"})
@Ensures("result != null")
private List<GATKSAMRecord> createPolyploidConsensus(final int start, final int stop, final int nHaplotypes, final int hetRefPosition) {
private List<GATKSAMRecord> createPolyploidConsensus(final int start, final int stop, final int hetRefPosition) {
// we will create two (positive strand, negative strand) headers for each contig
List<LinkedList<HeaderElement>> headersPosStrand = new ArrayList<LinkedList<HeaderElement>>();
List<LinkedList<HeaderElement>> headersNegStrand = new ArrayList<LinkedList<HeaderElement>>();
List<GATKSAMRecord> hetReads = new LinkedList<GATKSAMRecord>();
Map<Byte, Integer> haplotypeHeaderMap = new HashMap<Byte, Integer>(nHaplotypes);
Map<Byte, Integer> haplotypeHeaderMap = new HashMap<Byte, Integer>(2);
int currentHaplotype = 0;
int refStart = windowHeader.get(start).getLocation();
int refStop = windowHeader.get(stop).getLocation();

View File

@ -53,12 +53,14 @@ import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Basic unit test for BaseCounts in reduced reads
*/
public class BaseCountsUnitTest extends BaseTest {
private class BaseCountsTest {
public String bases;
public byte mostCountBase;
@ -71,9 +73,8 @@ public class BaseCountsUnitTest extends BaseTest {
}
}
@DataProvider(name = "data")
public Object[][] createData1() {
@DataProvider(name = "counting")
public Object[][] createCountingData() {
List<BaseCountsTest> params = new ArrayList<BaseCountsTest>();
params.add(new BaseCountsTest("A", 'A', 1 ));
@ -94,7 +95,7 @@ public class BaseCountsUnitTest extends BaseTest {
return params2.toArray(new Object[][]{});
}
@Test(dataProvider = "data", enabled = true)
@Test(dataProvider = "counting", enabled = true)
public void testCounting(BaseCountsTest params) {
BaseCounts counts = new BaseCounts();
@ -137,12 +138,64 @@ public class BaseCountsUnitTest extends BaseTest {
counts.decr((byte)'A');
Assert.assertEquals(counts.countOfBase(BaseIndex.A), countsFromArray.countOfBase(BaseIndex.A) - 1);
}
}
// Helper: counts only the four standard bases (A/C/G/T) by starting from the
// indel-free total and subtracting any N observations.
private static int ACGTcounts(final BaseCounts baseCounts) {
return baseCounts.totalCountWithoutIndels() - baseCounts.countOfBase(BaseIndex.N);
}
//////////////////////////////////
// TEST FOR QUALS IN BASECOUNTS //
//////////////////////////////////
// Parameter holder for the quality-sum tests: the list of base qualities to
// be accumulated for a single base.
private class BaseCountsQualsTest {
// the qualities to add, one incr() call per entry
public final List<Integer> quals;
private BaseCountsQualsTest(final List<Integer> quals) {
this.quals = quals;
}
}
/**
 * Data provider for the quality tests: every ordered triple drawn from a
 * fixed set of representative quality values.
 *
 * @return one test case per (qual1, qual2, qual3) combination
 */
@DataProvider(name = "quals")
public Object[][] createQualsData() {
    final int[] qualValues = { 0, 5, 10, 15, 20, 30, 40, 50 };
    final List<Object[]> cases = new ArrayList<Object[]>();
    for ( int i = 0; i < qualValues.length; i++ ) {
        for ( int j = 0; j < qualValues.length; j++ ) {
            for ( int k = 0; k < qualValues.length; k++ ) {
                final List<Integer> triple = Arrays.asList(qualValues[i], qualValues[j], qualValues[k]);
                cases.add(new Object[]{ new BaseCountsQualsTest(triple) });
            }
        }
    }
    return cases.toArray(new Object[][]{});
}
// Verifies that BaseCounts tracks per-base quality sums and averages: all
// qualities are accumulated on base A, then the reported sum, integer-average,
// and both byte/index variants of the proportion accessors are checked.
@Test(dataProvider = "quals", enabled = true)
public void testQuals(BaseCountsQualsTest test) {
BaseCounts counts = new BaseCounts();
// accumulate every quality on the same base so the sum is well defined
for ( int qual : test.quals )
counts.incr(BaseIndex.A, (byte)qual);
final int actualSum = (int)counts.getSumQuals((byte)'A');
final int expectedSum = qualSum(test.quals);
Assert.assertEquals(actualSum, expectedSum);
// average is integer division of the sum by the number of observations
final int actualAverage = (int)counts.averageQuals((byte)'A');
Assert.assertEquals(actualAverage, expectedSum / test.quals.size());
// test both proportion methods
Assert.assertEquals(counts.baseCountProportion(BaseIndex.A), counts.baseCountProportion((byte)'A'));
}
// Independently sums a list of quality values; used as the expected value
// for BaseCounts.getSumQuals().
private static int qualSum(final List<Integer> quals) {
    int total = 0;
    for ( int i = 0; i < quals.size(); i++ )
        total += quals.get(i);
    return total;
}
}

View File

@ -131,4 +131,69 @@ public class HeaderElementUnitTest extends BaseTest {
Assert.assertFalse(headerElement.isVariantFromMismatches(0.05));
Assert.assertEquals(headerElement.isVariant(0.05, 0.05), test.isClip);
}
// Parameter holder for the allele-counting tests: per-base observation
// counts plus the minimum proportion used to call a site variant.
private class AllelesTest {
// observation count for each of the four bases added in the test
public final int[] counts;
// minimum variant proportion passed to getNumberOfAlleles()
public final double proportion;
private AllelesTest(final int[] counts, final double proportion) {
this.counts = counts;
this.proportion = proportion;
}
}
/**
 * Data provider for the allele-counting tests: every combination of four
 * per-base counts crossed with every minimum-variant proportion.
 *
 * @return one test case per (count1..count4, proportion) combination
 */
@DataProvider(name = "alleles")
public Object[][] createAllelesData() {
    final int[] countChoices = { 0, 5, 10, 15, 20 };
    final double[] proportionChoices = { 0.0, 0.05, 0.10, 0.50, 1.0 };
    final List<Object[]> cases = new ArrayList<Object[]>();
    for ( final int a : countChoices ) {
        for ( final int c : countChoices ) {
            for ( final int g : countChoices ) {
                for ( final int t : countChoices ) {
                    for ( final double proportion : proportionChoices ) {
                        cases.add(new Object[]{ new AllelesTest(new int[]{a, c, g, t}, proportion) });
                    }
                }
            }
        }
    }
    return cases.toArray(new Object[][]{});
}
// Verifies HeaderElement.getNumberOfAlleles() against an independent
// re-implementation: the requested counts of each base are added to a fresh
// header element, then the allele counts are compared.
// NOTE(review): byte20/byte10/minBaseQual/minMappingQual are fields of the
// enclosing test class (not visible in this hunk) -- presumably fixed
// quality/mapping-quality constants high enough to make every base usable.
@Test(dataProvider = "alleles", enabled = true)
public void testAlleles(AllelesTest test) {
HeaderElement headerElement = new HeaderElement(1000, 0);
// add counts[i] observations of the i-th base (A, C, G, T)
for ( int i = 0; i < test.counts.length; i++ ) {
BaseIndex base = BaseIndex.values()[i];
for ( int j = 0; j < test.counts[i]; j++ )
headerElement.addBase(base.b, byte20, byte10, byte10, byte20, minBaseQual, minMappingQual, false);
}
final int nAllelesSeen = headerElement.getNumberOfAlleles(test.proportion);
final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.proportion);
Assert.assertEquals(nAllelesSeen, nAllelesExpected);
}
// Independent re-computation of the expected allele count: an allele is
// counted when it was observed at least once AND its count reaches the
// integer-truncated minimum fraction of the total observations.
private static int calculateExpectedAlleles(final int[] counts, final double proportion) {
    double total = 0.0;
    for ( int i = 0; i < counts.length; i++ )
        total += counts[i];
    final int minCount = (int)(proportion * total);
    int nAlleles = 0;
    for ( int i = 0; i < counts.length; i++ ) {
        final boolean observed = counts[i] > 0;
        final boolean frequentEnough = counts[i] >= minCount;
        if ( observed && frequentEnough )
            nAlleles++;
    }
    return nAlleles;
}
}

View File

@ -51,7 +51,6 @@ import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc;
@ -357,7 +356,7 @@ public class SlidingWindowUnitTest extends BaseTest {
@Test(dataProvider = "ConsensusCreation", enabled = true)
public void testConsensusCreationTest(ConsensusCreationTest test) {
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, 1, false);
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, false);
for ( final GATKSAMRecord read : test.myReads )
slidingWindow.addRead(read);
final Pair<Set<GATKSAMRecord>, CompressionStash> result = slidingWindow.close();
@ -390,7 +389,7 @@ public class SlidingWindowUnitTest extends BaseTest {
@Test(dataProvider = "Downsampling", enabled = true)
public void testDownsamplingTest(DSTest test) {
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, test.dcov, ReduceReads.DownsampleStrategy.Normal, false, 1, false);
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, test.dcov, ReduceReads.DownsampleStrategy.Normal, false, false);
final List<GATKSAMRecord> result = slidingWindow.downsampleVariantRegion(basicReads);
Assert.assertEquals(result.size(), Math.min(test.dcov, basicReads.size()));
@ -438,7 +437,7 @@ public class SlidingWindowUnitTest extends BaseTest {
@Test(dataProvider = "ConsensusQuals", enabled = true)
public void testConsensusQualsTest(QualsTest test) {
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, minUsableConsensusQual, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, 1, false);
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, minUsableConsensusQual, 20, 100, ReduceReads.DownsampleStrategy.Normal, false, false);
for ( final GATKSAMRecord read : test.myReads )
slidingWindow.addRead(read);
final Pair<Set<GATKSAMRecord>, CompressionStash> result = slidingWindow.close();