Added realistic BandPassFilterUnitTest that ensures quality results for 1000G phase I VCF and NA12878 VCF
-- Helped ID more bugs in the ActivityProfile, necessitating a new algorithm for popping off active regions. This new algorithm requires that at least maxRegionSize + prob. propagation distance states have been examined. This ensures that the incremental results are the same as you get reading in an entire profile and running getRegions on the full profile -- TODO is to remove incremental search start algorithm, as this is no longer necessary, and nicely eliminates a state variable I was always uncomfortable with
This commit is contained in:
parent
72b2e77eed
commit
c97a361b5d
|
|
@ -395,6 +395,13 @@ public class ActivityProfile {
|
|||
"result == -1 || result < maxRegionSize",
|
||||
"! (result == -1 && forceConversion)"})
|
||||
private int findEndOfRegion(final boolean isActiveRegion, final int minRegionSize, final int maxRegionSize, final boolean forceConversion) {
|
||||
if ( ! forceConversion && stateList.size() < maxRegionSize + getMaxProbPropagationDistance() ) {
|
||||
// we really haven't finalized at the probability mass that might affect our decision, so keep
|
||||
// waiting until we do before we try to make any decisions
|
||||
return -1;
|
||||
}
|
||||
|
||||
// TODO -- don't need startSearchForEndOfRegionHere with the above check
|
||||
int endOfActiveRegion = findFirstActivityBoundary(isActiveRegion, maxRegionSize, startSearchForEndOfRegionHere);
|
||||
startSearchForEndOfRegionHere = Math.max(endOfActiveRegion - getMaxProbPropagationDistance(), 0);
|
||||
|
||||
|
|
@ -403,9 +410,7 @@ public class ActivityProfile {
|
|||
endOfActiveRegion = findBestCutSite(endOfActiveRegion, minRegionSize);
|
||||
|
||||
// we're one past the end, so i must be decremented
|
||||
return forceConversion || endOfActiveRegion + getMaxProbPropagationDistance() < stateList.size()
|
||||
? endOfActiveRegion - 1
|
||||
: -1;
|
||||
return endOfActiveRegion - 1;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -72,6 +72,8 @@ public class ActivityProfileState {
|
|||
// make sure the location of that activity profile is 1
|
||||
if ( loc.size() != 1 )
|
||||
throw new IllegalArgumentException("Location for an ActivityProfileState must have to size 1 bp but saw " + loc);
|
||||
if ( resultValue != null && resultValue.doubleValue() < 0 )
|
||||
throw new IllegalArgumentException("Result value isn't null and its < 0, which is illegal: " + resultValue);
|
||||
|
||||
this.loc = loc;
|
||||
this.isActiveProb = isActiveProb;
|
||||
|
|
|
|||
|
|
@ -52,6 +52,14 @@ public class BandPassActivityProfile extends ActivityProfile {
|
|||
private final double sigma;
|
||||
private final double[] GaussianKernel;
|
||||
|
||||
/**
|
||||
* Create a new BandPassActivityProfile with default sigma and file sizes
|
||||
* @param parser our genome loc parser
|
||||
*/
|
||||
public BandPassActivityProfile(final GenomeLocParser parser) {
|
||||
this(parser, MAX_FILTER_SIZE, DEFAULT_SIGMA, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an activity profile that implements a band pass filter on the states
|
||||
* @param parser our genome loc parser
|
||||
|
|
|
|||
|
|
@ -35,7 +35,12 @@ import org.broadinstitute.sting.BaseTest;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContextTestProvider;
|
||||
import org.broadinstitute.variant.vcf.VCFCodec;
|
||||
import org.broadinstitute.variant.vcf.VCFHeader;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
|
|
@ -47,12 +52,13 @@ import java.util.*;
|
|||
|
||||
|
||||
public class BandPassActivityProfileUnitTest extends BaseTest {
|
||||
private final static boolean DEBUG = false;
|
||||
private GenomeLocParser genomeLocParser;
|
||||
|
||||
@BeforeClass
|
||||
public void init() throws FileNotFoundException {
|
||||
// sequence
|
||||
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference));
|
||||
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
|
||||
genomeLocParser = new GenomeLocParser(seq);
|
||||
}
|
||||
|
||||
|
|
@ -79,7 +85,7 @@ public class BandPassActivityProfileUnitTest extends BaseTest {
|
|||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BandPassBasicTest")
|
||||
@Test(enabled = ! DEBUG, dataProvider = "BandPassBasicTest")
|
||||
public void testBandPass(final int start, final boolean precedingIsActive, final int nPrecedingSites, final int bandPassSize, final double sigma) {
|
||||
final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, bandPassSize, sigma, false);
|
||||
|
||||
|
|
@ -133,7 +139,7 @@ public class BandPassActivityProfileUnitTest extends BaseTest {
|
|||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test( dataProvider = "BandPassComposition")
|
||||
@Test( enabled = ! DEBUG, dataProvider = "BandPassComposition")
|
||||
public void testBandPassComposition(final int bandPassSize, final int integrationLength) {
|
||||
final int start = 1;
|
||||
final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, bandPassSize, BandPassActivityProfile.DEFAULT_SIGMA);
|
||||
|
|
@ -207,7 +213,7 @@ public class BandPassActivityProfileUnitTest extends BaseTest {
|
|||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test( dataProvider = "KernelCreation")
|
||||
@Test( enabled = ! DEBUG, dataProvider = "KernelCreation")
|
||||
public void testKernelCreation(final double sigma, final int maxSize, final double[] expectedKernel) {
|
||||
final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, maxSize, sigma, true);
|
||||
|
||||
|
|
@ -216,4 +222,100 @@ public class BandPassActivityProfileUnitTest extends BaseTest {
|
|||
for ( int i = 0; i < kernel.length; i++ )
|
||||
Assert.assertEquals(kernel[i], expectedKernel[i], 1e-3, "Kernels not equal at " + i);
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
//
|
||||
// Large-scale test, reading in 1000G Phase I chr20 calls and making sure that
|
||||
// the regions returned are the same if you run on the entire profile vs. doing it
|
||||
// incremental
|
||||
//
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
@DataProvider(name = "VCFProfile")
|
||||
public Object[][] makeVCFProfile() {
|
||||
final List<Object[]> tests = new LinkedList<Object[]>();
|
||||
|
||||
//tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 61000});
|
||||
//tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 100000});
|
||||
//tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 1000000});
|
||||
tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 1000000});
|
||||
tests.add(new Object[]{ privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf", "20", 1, 1000000});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test( dataProvider = "VCFProfile")
|
||||
public void testVCFProfile(final String path, final String contig, final int start, final int end) throws Exception {
|
||||
final int extension = 50;
|
||||
final int minRegionSize = 50;
|
||||
final int maxRegionSize = 300;
|
||||
|
||||
final File file = new File(path);
|
||||
final VCFCodec codec = new VCFCodec();
|
||||
final Pair<VCFHeader, Iterable<VariantContext>> reader = VariantContextTestProvider.readAllVCs(file, codec);
|
||||
|
||||
final List<ActiveRegion> incRegions = new ArrayList<ActiveRegion>();
|
||||
final BandPassActivityProfile incProfile = new BandPassActivityProfile(genomeLocParser);
|
||||
final BandPassActivityProfile fullProfile = new BandPassActivityProfile(genomeLocParser);
|
||||
int pos = start;
|
||||
for ( final VariantContext vc : reader.getSecond() ) {
|
||||
if ( vc == null ) continue;
|
||||
while ( pos < vc.getStart() ) {
|
||||
final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos);
|
||||
//logger.warn("Adding 0.0 at " + loc + " because vc.getStart is " + vc.getStart());
|
||||
incProfile.add(new ActivityProfileState(loc, 0.0));
|
||||
fullProfile.add(new ActivityProfileState(loc, 0.0));
|
||||
pos++;
|
||||
}
|
||||
if ( vc.getStart() >= start && vc.getEnd() <= end ) {
|
||||
final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos);
|
||||
//logger.warn("Adding 1.0 at " + loc);
|
||||
ActivityProfileState.Type type = ActivityProfileState.Type.NONE;
|
||||
Number value = null;
|
||||
if ( vc.isBiallelic() && vc.isIndel() ) {
|
||||
type = ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS;
|
||||
value = Math.abs(vc.getIndelLengths().get(0));
|
||||
}
|
||||
final ActivityProfileState state = new ActivityProfileState(loc, 1.0, type, value);
|
||||
incProfile.add(state);
|
||||
fullProfile.add(state);
|
||||
pos++;
|
||||
}
|
||||
|
||||
incRegions.addAll(incProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, false));
|
||||
|
||||
if ( vc.getStart() > end )
|
||||
break;
|
||||
}
|
||||
|
||||
incRegions.addAll(incProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, true));
|
||||
|
||||
final List<ActiveRegion> fullRegions = fullProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, true);
|
||||
assertGoodRegions(fullRegions, start, end, maxRegionSize);
|
||||
assertGoodRegions(incRegions, start, end, maxRegionSize);
|
||||
|
||||
Assert.assertEquals(incRegions.size(), fullRegions.size(), "incremental and full region sizes aren't the same");
|
||||
for ( int i = 0; i < fullRegions.size(); i++ ) {
|
||||
final ActiveRegion incRegion = incRegions.get(i);
|
||||
final ActiveRegion fullRegion = fullRegions.get(i);
|
||||
Assert.assertTrue(incRegion.equalExceptReads(fullRegion), "Full and incremental regions are not equal: full = " + fullRegion + " inc = " + incRegion);
|
||||
}
|
||||
}
|
||||
|
||||
private void assertGoodRegions(final List<ActiveRegion> regions, final int start, final int end, final int maxRegionSize) {
|
||||
int lastPosSeen = start - 1;
|
||||
for ( int regionI = 0; regionI < regions.size(); regionI++ ) {
|
||||
final ActiveRegion region = regions.get(regionI);
|
||||
Assert.assertEquals(region.getLocation().getStart(), lastPosSeen + 1, "discontinuous with previous region. lastPosSeen " + lastPosSeen + " but region is " + region);
|
||||
Assert.assertTrue(region.getLocation().size() <= maxRegionSize, "Region is too big: " + region);
|
||||
lastPosSeen = region.getLocation().getStop();
|
||||
|
||||
for ( final ActivityProfileState state : region.getSupportingStates() ) {
|
||||
Assert.assertEquals(state.isActiveProb > ActivityProfile.ACTIVE_PROB_THRESHOLD, region.isActive,
|
||||
"Region is active=" + region.isActive + " but contains a state " + state + " with prob "
|
||||
+ state.isActiveProb + " not within expected values given threshold for activity of "
|
||||
+ ActivityProfile.ACTIVE_PROB_THRESHOLD);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -688,7 +688,7 @@ public class VariantContextTestProvider {
|
|||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
private final static Pair<VCFHeader, Iterable<VariantContext>> readAllVCs( final File source, final FeatureCodec<VariantContext> codec ) throws IOException {
|
||||
public final static Pair<VCFHeader, Iterable<VariantContext>> readAllVCs( final File source, final FeatureCodec<VariantContext> codec ) throws IOException {
|
||||
// read in the features
|
||||
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source));
|
||||
FeatureCodecHeader header = codec.readHeader(pbs);
|
||||
|
|
@ -701,7 +701,7 @@ public class VariantContextTestProvider {
|
|||
return new Pair<VCFHeader, Iterable<VariantContext>>(vcfHeader, new VCIterable(pbs, codec, vcfHeader));
|
||||
}
|
||||
|
||||
private static class VCIterable implements Iterable<VariantContext>, Iterator<VariantContext> {
|
||||
public static class VCIterable implements Iterable<VariantContext>, Iterator<VariantContext> {
|
||||
final PositionalBufferedStream pbs;
|
||||
final FeatureCodec<VariantContext> codec;
|
||||
final VCFHeader header;
|
||||
|
|
|
|||
Loading…
Reference in New Issue