Added realistic BandPassFilterUnitTest that ensures quality results for 1000G phase I VCF and NA12878 VCF

-- Helped ID more bugs in the ActivityProfile, necessitating a new algorithm for popping off active regions.  This new algorithm requires that at least maxRegionSize + prob. propagation distance states have been examined.  This ensures that the incremental results are the same as you get reading in an entire profile and running getRegions on the full profile
-- TODO is to remove incremental search start algorithm, as this is no longer necessary, and nicely eliminates a state variable I was always uncomfortable with
This commit is contained in:
Mark DePristo 2013-01-27 13:39:35 -05:00
parent 72b2e77eed
commit c97a361b5d
5 changed files with 126 additions and 9 deletions

View File

@ -395,6 +395,13 @@ public class ActivityProfile {
"result == -1 || result < maxRegionSize",
"! (result == -1 && forceConversion)"})
private int findEndOfRegion(final boolean isActiveRegion, final int minRegionSize, final int maxRegionSize, final boolean forceConversion) {
if ( ! forceConversion && stateList.size() < maxRegionSize + getMaxProbPropagationDistance() ) {
// we really haven't finalized at the probability mass that might affect our decision, so keep
// waiting until we do before we try to make any decisions
return -1;
}
// TODO -- don't need startSearchForEndOfRegionHere with the above check
int endOfActiveRegion = findFirstActivityBoundary(isActiveRegion, maxRegionSize, startSearchForEndOfRegionHere);
startSearchForEndOfRegionHere = Math.max(endOfActiveRegion - getMaxProbPropagationDistance(), 0);
@ -403,9 +410,7 @@ public class ActivityProfile {
endOfActiveRegion = findBestCutSite(endOfActiveRegion, minRegionSize);
// we're one past the end, so i must be decremented
return forceConversion || endOfActiveRegion + getMaxProbPropagationDistance() < stateList.size()
? endOfActiveRegion - 1
: -1;
return endOfActiveRegion - 1;
}
/**

View File

@ -72,6 +72,8 @@ public class ActivityProfileState {
// make sure the location of that activity profile is 1
if ( loc.size() != 1 )
throw new IllegalArgumentException("Location for an ActivityProfileState must have to size 1 bp but saw " + loc);
if ( resultValue != null && resultValue.doubleValue() < 0 )
throw new IllegalArgumentException("Result value isn't null and its < 0, which is illegal: " + resultValue);
this.loc = loc;
this.isActiveProb = isActiveProb;

View File

@ -52,6 +52,14 @@ public class BandPassActivityProfile extends ActivityProfile {
private final double sigma;
private final double[] GaussianKernel;
/**
* Create a new BandPassActivityProfile with default sigma and file sizes
* @param parser our genome loc parser
*/
public BandPassActivityProfile(final GenomeLocParser parser) {
this(parser, MAX_FILTER_SIZE, DEFAULT_SIGMA, true);
}
/**
* Create an activity profile that implements a band pass filter on the states
* @param parser our genome loc parser

View File

@ -35,7 +35,12 @@ import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.variantcontext.VariantContextTestProvider;
import org.broadinstitute.variant.vcf.VCFCodec;
import org.broadinstitute.variant.vcf.VCFHeader;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
@ -47,12 +52,13 @@ import java.util.*;
public class BandPassActivityProfileUnitTest extends BaseTest {
private final static boolean DEBUG = false;
private GenomeLocParser genomeLocParser;
@BeforeClass
public void init() throws FileNotFoundException {
// sequence
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
genomeLocParser = new GenomeLocParser(seq);
}
@ -79,7 +85,7 @@ public class BandPassActivityProfileUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "BandPassBasicTest")
@Test(enabled = ! DEBUG, dataProvider = "BandPassBasicTest")
public void testBandPass(final int start, final boolean precedingIsActive, final int nPrecedingSites, final int bandPassSize, final double sigma) {
final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, bandPassSize, sigma, false);
@ -133,7 +139,7 @@ public class BandPassActivityProfileUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test( dataProvider = "BandPassComposition")
@Test( enabled = ! DEBUG, dataProvider = "BandPassComposition")
public void testBandPassComposition(final int bandPassSize, final int integrationLength) {
final int start = 1;
final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, bandPassSize, BandPassActivityProfile.DEFAULT_SIGMA);
@ -207,7 +213,7 @@ public class BandPassActivityProfileUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test( dataProvider = "KernelCreation")
@Test( enabled = ! DEBUG, dataProvider = "KernelCreation")
public void testKernelCreation(final double sigma, final int maxSize, final double[] expectedKernel) {
final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, maxSize, sigma, true);
@ -216,4 +222,100 @@ public class BandPassActivityProfileUnitTest extends BaseTest {
for ( int i = 0; i < kernel.length; i++ )
Assert.assertEquals(kernel[i], expectedKernel[i], 1e-3, "Kernels not equal at " + i);
}
// ------------------------------------------------------------------------------------
//
// Large-scale test, reading in 1000G Phase I chr20 calls and making sure that
// the regions returned are the same if you run on the entire profile vs. doing it
// incremental
//
// ------------------------------------------------------------------------------------
@DataProvider(name = "VCFProfile")
public Object[][] makeVCFProfile() {
final List<Object[]> tests = new LinkedList<Object[]>();
//tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 61000});
//tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 100000});
//tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 1000000});
tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 1000000});
tests.add(new Object[]{ privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf", "20", 1, 1000000});
return tests.toArray(new Object[][]{});
}
@Test( dataProvider = "VCFProfile")
public void testVCFProfile(final String path, final String contig, final int start, final int end) throws Exception {
final int extension = 50;
final int minRegionSize = 50;
final int maxRegionSize = 300;
final File file = new File(path);
final VCFCodec codec = new VCFCodec();
final Pair<VCFHeader, Iterable<VariantContext>> reader = VariantContextTestProvider.readAllVCs(file, codec);
final List<ActiveRegion> incRegions = new ArrayList<ActiveRegion>();
final BandPassActivityProfile incProfile = new BandPassActivityProfile(genomeLocParser);
final BandPassActivityProfile fullProfile = new BandPassActivityProfile(genomeLocParser);
int pos = start;
for ( final VariantContext vc : reader.getSecond() ) {
if ( vc == null ) continue;
while ( pos < vc.getStart() ) {
final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos);
//logger.warn("Adding 0.0 at " + loc + " because vc.getStart is " + vc.getStart());
incProfile.add(new ActivityProfileState(loc, 0.0));
fullProfile.add(new ActivityProfileState(loc, 0.0));
pos++;
}
if ( vc.getStart() >= start && vc.getEnd() <= end ) {
final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos);
//logger.warn("Adding 1.0 at " + loc);
ActivityProfileState.Type type = ActivityProfileState.Type.NONE;
Number value = null;
if ( vc.isBiallelic() && vc.isIndel() ) {
type = ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS;
value = Math.abs(vc.getIndelLengths().get(0));
}
final ActivityProfileState state = new ActivityProfileState(loc, 1.0, type, value);
incProfile.add(state);
fullProfile.add(state);
pos++;
}
incRegions.addAll(incProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, false));
if ( vc.getStart() > end )
break;
}
incRegions.addAll(incProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, true));
final List<ActiveRegion> fullRegions = fullProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, true);
assertGoodRegions(fullRegions, start, end, maxRegionSize);
assertGoodRegions(incRegions, start, end, maxRegionSize);
Assert.assertEquals(incRegions.size(), fullRegions.size(), "incremental and full region sizes aren't the same");
for ( int i = 0; i < fullRegions.size(); i++ ) {
final ActiveRegion incRegion = incRegions.get(i);
final ActiveRegion fullRegion = fullRegions.get(i);
Assert.assertTrue(incRegion.equalExceptReads(fullRegion), "Full and incremental regions are not equal: full = " + fullRegion + " inc = " + incRegion);
}
}
private void assertGoodRegions(final List<ActiveRegion> regions, final int start, final int end, final int maxRegionSize) {
int lastPosSeen = start - 1;
for ( int regionI = 0; regionI < regions.size(); regionI++ ) {
final ActiveRegion region = regions.get(regionI);
Assert.assertEquals(region.getLocation().getStart(), lastPosSeen + 1, "discontinuous with previous region. lastPosSeen " + lastPosSeen + " but region is " + region);
Assert.assertTrue(region.getLocation().size() <= maxRegionSize, "Region is too big: " + region);
lastPosSeen = region.getLocation().getStop();
for ( final ActivityProfileState state : region.getSupportingStates() ) {
Assert.assertEquals(state.isActiveProb > ActivityProfile.ACTIVE_PROB_THRESHOLD, region.isActive,
"Region is active=" + region.isActive + " but contains a state " + state + " with prob "
+ state.isActiveProb + " not within expected values given threshold for activity of "
+ ActivityProfile.ACTIVE_PROB_THRESHOLD);
}
}
}
}

View File

@ -688,7 +688,7 @@ public class VariantContextTestProvider {
* @return
* @throws IOException
*/
private final static Pair<VCFHeader, Iterable<VariantContext>> readAllVCs( final File source, final FeatureCodec<VariantContext> codec ) throws IOException {
public final static Pair<VCFHeader, Iterable<VariantContext>> readAllVCs( final File source, final FeatureCodec<VariantContext> codec ) throws IOException {
// read in the features
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source));
FeatureCodecHeader header = codec.readHeader(pbs);
@ -701,7 +701,7 @@ public class VariantContextTestProvider {
return new Pair<VCFHeader, Iterable<VariantContext>>(vcfHeader, new VCIterable(pbs, codec, vcfHeader));
}
private static class VCIterable implements Iterable<VariantContext>, Iterator<VariantContext> {
public static class VCIterable implements Iterable<VariantContext>, Iterator<VariantContext> {
final PositionalBufferedStream pbs;
final FeatureCodec<VariantContext> codec;
final VCFHeader header;