diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 14a8e6dcb..ae181463b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -395,6 +395,13 @@ public class ActivityProfile { "result == -1 || result < maxRegionSize", "! (result == -1 && forceConversion)"}) private int findEndOfRegion(final boolean isActiveRegion, final int minRegionSize, final int maxRegionSize, final boolean forceConversion) { + if ( ! forceConversion && stateList.size() < maxRegionSize + getMaxProbPropagationDistance() ) { + // we really haven't finalized at the probability mass that might affect our decision, so keep + // waiting until we do before we try to make any decisions + return -1; + } + + // TODO -- don't need startSearchForEndOfRegionHere with the above check int endOfActiveRegion = findFirstActivityBoundary(isActiveRegion, maxRegionSize, startSearchForEndOfRegionHere); startSearchForEndOfRegionHere = Math.max(endOfActiveRegion - getMaxProbPropagationDistance(), 0); @@ -403,9 +410,7 @@ public class ActivityProfile { endOfActiveRegion = findBestCutSite(endOfActiveRegion, minRegionSize); // we're one past the end, so i must be decremented - return forceConversion || endOfActiveRegion + getMaxProbPropagationDistance() < stateList.size() - ? endOfActiveRegion - 1 - : -1; + return endOfActiveRegion - 1; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java index 5bba7db17..309405be0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java @@ -72,6 +72,8 @@ public class ActivityProfileState { // make sure the location of that activity profile is 1 if ( loc.size() != 1 ) throw new IllegalArgumentException("Location for an ActivityProfileState must have to size 1 bp but saw " + loc); + if ( resultValue != null && resultValue.doubleValue() < 0 ) + throw new IllegalArgumentException("Result value isn't null and its < 0, which is illegal: " + resultValue); this.loc = loc; this.isActiveProb = isActiveProb; diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java index a1f1fc580..abbc74df4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java @@ -52,6 +52,14 @@ public class BandPassActivityProfile extends ActivityProfile { private final double sigma; private final double[] GaussianKernel; + /** + * Create a new BandPassActivityProfile with default sigma and file sizes + * @param parser our genome loc parser + */ + public BandPassActivityProfile(final GenomeLocParser parser) { + this(parser, MAX_FILTER_SIZE, DEFAULT_SIGMA, true); + } + /** * Create an activity profile that implements a band pass filter on the states * @param parser our genome loc parser diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java index 540e9470a..2c7d78169 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java @@ -35,7 +35,12 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextTestProvider; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.broadinstitute.variant.vcf.VCFHeader; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -47,12 +52,13 @@ import java.util.*; public class BandPassActivityProfileUnitTest extends BaseTest { + private final static boolean DEBUG = false; private GenomeLocParser genomeLocParser; @BeforeClass public void init() throws FileNotFoundException { // sequence - ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); + ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); genomeLocParser = new GenomeLocParser(seq); } @@ -79,7 +85,7 @@ public class BandPassActivityProfileUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "BandPassBasicTest") + @Test(enabled = ! DEBUG, dataProvider = "BandPassBasicTest") public void testBandPass(final int start, final boolean precedingIsActive, final int nPrecedingSites, final int bandPassSize, final double sigma) { final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, bandPassSize, sigma, false); @@ -133,7 +139,7 @@ public class BandPassActivityProfileUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test( dataProvider = "BandPassComposition") + @Test( enabled = ! DEBUG, dataProvider = "BandPassComposition") public void testBandPassComposition(final int bandPassSize, final int integrationLength) { final int start = 1; final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, bandPassSize, BandPassActivityProfile.DEFAULT_SIGMA); @@ -207,7 +213,7 @@ public class BandPassActivityProfileUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test( dataProvider = "KernelCreation") + @Test( enabled = ! DEBUG, dataProvider = "KernelCreation") public void testKernelCreation(final double sigma, final int maxSize, final double[] expectedKernel) { final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, maxSize, sigma, true); @@ -216,4 +222,100 @@ public class BandPassActivityProfileUnitTest extends BaseTest { for ( int i = 0; i < kernel.length; i++ ) Assert.assertEquals(kernel[i], expectedKernel[i], 1e-3, "Kernels not equal at " + i); } + + // ------------------------------------------------------------------------------------ + // + // Large-scale test, reading in 1000G Phase I chr20 calls and making sure that + // the regions returned are the same if you run on the entire profile vs. doing it + // incremental + // + // ------------------------------------------------------------------------------------ + + @DataProvider(name = "VCFProfile") + public Object[][] makeVCFProfile() { + final List tests = new LinkedList(); + + //tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 61000}); + //tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 100000}); + //tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 1000000}); + tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 1000000}); + tests.add(new Object[]{ privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf", "20", 1, 1000000}); + + return tests.toArray(new Object[][]{}); + } + + @Test( dataProvider = "VCFProfile") + public void testVCFProfile(final String path, final String contig, final int start, final int end) throws Exception { + final int extension = 50; + final int minRegionSize = 50; + final int maxRegionSize = 300; + + final File file = new File(path); + final VCFCodec codec = new VCFCodec(); + final Pair> reader = VariantContextTestProvider.readAllVCs(file, codec); + + final List incRegions = new ArrayList(); + final BandPassActivityProfile incProfile = new BandPassActivityProfile(genomeLocParser); + final BandPassActivityProfile fullProfile = new BandPassActivityProfile(genomeLocParser); + int pos = start; + for ( final VariantContext vc : reader.getSecond() ) { + if ( vc == null ) continue; + while ( pos < vc.getStart() ) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos); + //logger.warn("Adding 0.0 at " + loc + " because vc.getStart is " + vc.getStart()); + incProfile.add(new ActivityProfileState(loc, 0.0)); + fullProfile.add(new ActivityProfileState(loc, 0.0)); + pos++; + } + if ( vc.getStart() >= start && vc.getEnd() <= end ) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos); + //logger.warn("Adding 1.0 at " + loc); + ActivityProfileState.Type type = ActivityProfileState.Type.NONE; + Number value = null; + if ( vc.isBiallelic() && vc.isIndel() ) { + type = ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS; + value = Math.abs(vc.getIndelLengths().get(0)); + } + final ActivityProfileState state = new ActivityProfileState(loc, 1.0, type, value); + incProfile.add(state); + fullProfile.add(state); + pos++; + } + + incRegions.addAll(incProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, false)); + + if ( vc.getStart() > end ) + break; + } + + incRegions.addAll(incProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, true)); + + final List fullRegions = fullProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, true); + assertGoodRegions(fullRegions, start, end, maxRegionSize); + assertGoodRegions(incRegions, start, end, maxRegionSize); + + Assert.assertEquals(incRegions.size(), fullRegions.size(), "incremental and full region sizes aren't the same"); + for ( int i = 0; i < fullRegions.size(); i++ ) { + final ActiveRegion incRegion = incRegions.get(i); + final ActiveRegion fullRegion = fullRegions.get(i); + Assert.assertTrue(incRegion.equalExceptReads(fullRegion), "Full and incremental regions are not equal: full = " + fullRegion + " inc = " + incRegion); + } + } + + private void assertGoodRegions(final List regions, final int start, final int end, final int maxRegionSize) { + int lastPosSeen = start - 1; + for ( int regionI = 0; regionI < regions.size(); regionI++ ) { + final ActiveRegion region = regions.get(regionI); + Assert.assertEquals(region.getLocation().getStart(), lastPosSeen + 1, "discontinuous with previous region. lastPosSeen " + lastPosSeen + " but region is " + region); + Assert.assertTrue(region.getLocation().size() <= maxRegionSize, "Region is too big: " + region); + lastPosSeen = region.getLocation().getStop(); + + for ( final ActivityProfileState state : region.getSupportingStates() ) { + Assert.assertEquals(state.isActiveProb > ActivityProfile.ACTIVE_PROB_THRESHOLD, region.isActive, + "Region is active=" + region.isActive + " but contains a state " + state + " with prob " + + state.isActiveProb + " not within expected values given threshold for activity of " + + ActivityProfile.ACTIVE_PROB_THRESHOLD); + } + } + } } diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextTestProvider.java index 0a2dc384e..f489806f6 100644 --- a/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextTestProvider.java @@ -688,7 +688,7 @@ public class VariantContextTestProvider { * @return * @throws IOException */ - private final static Pair> readAllVCs( final File source, final FeatureCodec codec ) throws IOException { + public final static Pair> readAllVCs( final File source, final FeatureCodec codec ) throws IOException { // read in the features PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); FeatureCodecHeader header = codec.readHeader(pbs); @@ -701,7 +701,7 @@ public class VariantContextTestProvider { return new Pair>(vcfHeader, new VCIterable(pbs, codec, vcfHeader)); } - private static class VCIterable implements Iterable, Iterator { + public static class VCIterable implements Iterable, Iterator { final PositionalBufferedStream pbs; final FeatureCodec codec; final VCFHeader header;