From be45edeff2abf3d60c67549b7459dbb0768719f8 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 14 Feb 2013 09:36:06 -0800 Subject: [PATCH] ActivityProfile and ActiveRegions respects engine interval boundaries -- Active regions are created as normal, but they are split and trimmed to the engine intervals when added to the traversal, if there are intervals present. -- UnitTests for ActiveRegion.splitAndTrimToIntervals -- GenomeLocSortedSet.getOverlapping uses binary search to efficiently in ~ log N time find overlapping intervals -- UnitTesting overlap function in GenomeLocSortedSet -- Discovered fundamental implementation bug in that adding genome locs out of order (elements on 20 then on 19) produces an invalid GenomeLocSortedSet. Created a JIRA to address this: https://jira.broadinstitute.org/browse/GSA-775 -- Constructor that takes a collection of genome locs now sorts its input and merges overlapping intervals -- Added docs for the constructors in GLSS -- Update HaplotypeCaller MD5s, which change because ActiveRegions are now restricted to the engine intervals, which changes slightly the regions in the tests and so the reads in the regions, and thus the md5s -- GenomeAnalysisEngineUnitTest needs to provide non-null genome loc parser --- .../HaplotypeCallerIntegrationTest.java | 10 +- .../traversals/TraverseActiveRegions.java | 3 +- .../sting/utils/GenomeLocSortedSet.java | 154 ++++++++++++------ .../utils/activeregion/ActiveRegion.java | 32 +++- .../utils/activeregion/ActivityProfile.java | 19 ++- .../activeregion/BandPassActivityProfile.java | 30 ++-- .../gatk/GenomeAnalysisEngineUnitTest.java | 6 +- .../utils/GenomeLocSortedSetUnitTest.java | 65 +++++++- .../activeregion/ActiveRegionUnitTest.java | 103 +++++++++++- .../activeregion/ActivityProfileUnitTest.java | 2 +- .../BandPassActivityProfileUnitTest.java | 10 +- 11 files changed, 348 insertions(+), 86 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 74e28db63..0aa946d67 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "86ceec507e70d542decdae1d20ed6f82"); + "f751363288740c6fd9179a487be61fb4"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -96,13 +96,13 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "76d4c4a112cf60080adf74c3e116d1fb"); + "262e4c9a55baf1936a65612cfb1f6f81"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "23a4bfa0300683d8cf2ec16ce96e89ad"); + "71ef8d0217c1a73dd360413dccd05f4d"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -153,7 +153,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("a77ac53d67937feebfba22a9336a5421")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("a4e74226b16a7d8c5999620c2f6be1ba")); executeTest("HCTestStructuralIndels: ", spec); } @@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("255947f39455c87c561be4aee4cab651")); + Arrays.asList("87bd7ac2f7d65580838c7c956ccf52b7")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 5d2aa6be3..64c6d5094 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -109,7 +109,8 @@ public class TraverseActiveRegions extends TraversalEngine * Class GenomeLocCollection *

@@ -59,6 +43,10 @@ import java.util.*; * will also remove a region from the list, if the region to remove is a * partial interval of a region in the collection it will remove the region from * that element. + * + * @author aaron + * Date: May 22, 2009 + * Time: 10:54:40 AM */ public class GenomeLocSortedSet extends AbstractSet { private static Logger logger = Logger.getLogger(GenomeLocSortedSet.class); @@ -66,26 +54,47 @@ public class GenomeLocSortedSet extends AbstractSet { private GenomeLocParser genomeLocParser; // our private storage for the GenomeLoc's - private List mArray = new ArrayList(); + private final List mArray = new ArrayList(); // cache this to make overlap checking much more efficient private int previousOverlapSearchIndex = -1; - /** default constructor */ - public GenomeLocSortedSet(GenomeLocParser parser) { + /** + * Create a new, empty GenomeLocSortedSet + * + * @param parser a non-null the parser we use to create genome locs + */ + public GenomeLocSortedSet(final GenomeLocParser parser) { + if ( parser == null ) throw new IllegalArgumentException("parser cannot be null"); this.genomeLocParser = parser; } - public GenomeLocSortedSet(GenomeLocParser parser,GenomeLoc e) { + /** + * Create a new GenomeLocSortedSet containing location e + * + * @param parser a non-null the parser we use to create genome locs + * @param e a single genome locs to add to this set + */ + public GenomeLocSortedSet(final GenomeLocParser parser, final GenomeLoc e) { this(parser); add(e); } - public GenomeLocSortedSet(GenomeLocParser parser,Collection l) { + /** + * Create a new GenomeLocSortedSet containing locations l + * + * The elements in l can be in any order, and can be overlapping. They will be sorted first and + * overlapping (but not contiguous) elements will be merged + * + * @param parser a non-null the parser we use to create genome locs + * @param l a collection of genome locs to add to this set + */ + public GenomeLocSortedSet(final GenomeLocParser parser, final Collection l) { this(parser); - for ( GenomeLoc e : l ) - add(e); + final ArrayList sorted = new ArrayList(l); + Collections.sort(sorted); + mArray.addAll(IntervalUtils.mergeIntervalLocations(sorted, IntervalMergingRule.OVERLAPPING_ONLY)); } /** @@ -198,9 +207,72 @@ public class GenomeLocSortedSet extends AbstractSet { return returnValue; } + /** + * Return a list of intervals overlapping loc + * + * @param loc the location we want overlapping intervals + * @return a non-null list of locations that overlap loc + */ + public List getOverlapping(final GenomeLoc loc) { + // the max ensures that if loc would be the first element, that we start searching at the first element + final int index = Collections.binarySearch(mArray, loc); + if ( index >= 0 ) + // we can safely return a singleton because overlapping regions are merged and loc is exactly in + // the set already + return Collections.singletonList(loc); + + // if loc isn't in the list index is (-(insertion point) - 1). The insertion point is defined as the point at + // which the key would be inserted into the list: the index of the first element greater than the key, or list.size() + // -ins - 1 = index => -ins = index + 1 => ins = -(index + 1) + // Note that we look one before the index in this case, as loc might occur after the previous overlapping interval + final int start = Math.max(-(index + 1) - 1, 0); + final int size = mArray.size(); + + final List overlapping = new LinkedList(); + for ( int i = start; i < size; i++ ) { + final GenomeLoc myLoc = mArray.get(i); + if ( loc.overlapsP(myLoc) ) + overlapping.add(myLoc); + else if ( myLoc.isPast(loc) ) + // since mArray is ordered, if myLoc is past loc that means all future + // intervals cannot overlap loc either. So we can safely abort the search + // note that we need to be a bit conservative on our tests since index needs to start + // at -1 the position of index, so it's possible that myLoc and loc don't overlap but the next + // position might + break; + } + + return overlapping; + } + + /** + * Return a list of intervals overlapping loc by enumerating all locs and testing for overlap + * + * Purely for testing purposes -- this is way to slow for any production code + * + * @param loc the location we want overlapping intervals + * @return a non-null list of locations that overlap loc + */ + protected List getOverlappingFullSearch(final GenomeLoc loc) { + final List overlapping = new LinkedList(); + + // super slow, but definitely works + for ( final GenomeLoc myLoc : mArray ) { + if ( loc.overlapsP(myLoc) ) + overlapping.add(myLoc); + } + + return overlapping; + } + /** * add a genomeLoc to the collection, simply inserting in order into the set * + * TODO -- this may break the contract of the GenomeLocSortedSet if e overlaps or + * TODO -- other locations already in the set. This code should check to see if + * TODO -- e is overlapping with its nearby elements and merge them or alternatively + * TODO -- throw an exception + * * @param e the GenomeLoc to add * * @return true @@ -225,6 +297,11 @@ public class GenomeLocSortedSet extends AbstractSet { * Adds a GenomeLoc to the collection, merging it if it overlaps another region. * If it's not overlapping then we add it in sorted order. * + * TODO TODO TODO -- this function is buggy and will not properly create a sorted + * TODO TODO TODO -- genome loc is addRegion is called sequentially where the second + * TODO TODO TODO -- loc added is actually before the first. So when creating + * TODO TODO TODO -- sets make sure to sort the input locations first! + * * @param e the GenomeLoc to add to the collection * * @return true, if the GenomeLoc could be added to the collection @@ -380,31 +457,4 @@ public class GenomeLocSortedSet extends AbstractSet { return s.toString(); } - - /** - * Check to see whether two genomeLocSortedSets are equal. - * Note that this implementation ignores the contigInfo object. - * - */ /* - @Override - public boolean equals(Object other) { - if(other == null) - return false; - if(other instanceof GenomeLocSortedSet) { - // send to a list, so we can ensure order correct - List otherList = ((GenomeLocSortedSet)other).toList(); - List thisList = this.toList(); - if (otherList.size() != this.size()) - return false; - - for (Integer i=0;i splitAndTrimToIntervals(final GenomeLocSortedSet intervals) { + final List allOverlapping = intervals.getOverlapping(getLocation()); + final List clippedRegions = new LinkedList(); + + for ( final GenomeLoc overlapping : allOverlapping ) { + final GenomeLoc subLoc = getLocation().intersect(overlapping); + final int subStart = subLoc.getStart() - getLocation().getStart(); + final int subEnd = subStart + subLoc.size(); + final List subStates = supportingStates.isEmpty() ? supportingStates : supportingStates.subList(subStart, subEnd); + final ActiveRegion clipped = new ActiveRegion( subLoc, subStates, isActive, genomeLocParser, extension ); + clippedRegions.add(clipped); + } + + return clippedRegions; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index ff4673717..25948a857 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -29,6 +29,7 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import java.util.*; @@ -45,6 +46,7 @@ public class ActivityProfile { protected final List stateList; protected final GenomeLocParser parser; + protected final GenomeLocSortedSet restrictToIntervals; protected GenomeLoc regionStartLoc = null; protected GenomeLoc regionStopLoc = null; @@ -60,10 +62,20 @@ public class ActivityProfile { * @param parser the parser we can use to create genome locs, cannot be null */ public ActivityProfile(final GenomeLocParser parser) { + this(parser, null); + } + + /** + * Create a empty ActivityProfile, restricting output to profiles overlapping intervals, if not null + * @param parser the parser we can use to create genome locs, cannot be null + * @param intervals only include states that are within these intervals, if not null + */ + public ActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet intervals) { if ( parser == null ) throw new IllegalArgumentException("parser cannot be null"); this.parser = parser; this.stateList = new ArrayList(); + this.restrictToIntervals = intervals; } @Override @@ -224,7 +236,7 @@ public class ActivityProfile { if ( position > size() ) // should we allow this? probably not - throw new IllegalArgumentException("Must add state contiguous to existing states"); + throw new IllegalArgumentException("Must add state contiguous to existing states: adding " + stateToAdd); if ( position >= 0 ) { // ignore states starting before this regions start @@ -313,7 +325,10 @@ public class ActivityProfile { if ( nextRegion == null ) return regions; else { - regions.add(nextRegion); + if ( restrictToIntervals == null ) + regions.add(nextRegion); + else + regions.addAll(nextRegion.splitAndTrimToIntervals(restrictToIntervals)); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java index abbc74df4..f2bc86dfc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.activeregion; import com.google.java.contract.Ensures; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.MathUtils; import java.util.ArrayList; @@ -53,25 +54,34 @@ public class BandPassActivityProfile extends ActivityProfile { private final double[] GaussianKernel; /** - * Create a new BandPassActivityProfile with default sigma and file sizes - * @param parser our genome loc parser + * Create a new BandPassActivityProfile with default sigma and filter sizes + * + * @see #BandPassActivityProfile(org.broadinstitute.sting.utils.GenomeLocParser, org.broadinstitute.sting.utils.GenomeLocSortedSet, int, double, boolean) */ - public BandPassActivityProfile(final GenomeLocParser parser) { - this(parser, MAX_FILTER_SIZE, DEFAULT_SIGMA, true); + public BandPassActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet restrictToIntervals) { + this(parser, restrictToIntervals, MAX_FILTER_SIZE, DEFAULT_SIGMA, true); + } + + /** + * @see #BandPassActivityProfile(org.broadinstitute.sting.utils.GenomeLocParser, org.broadinstitute.sting.utils.GenomeLocSortedSet, int, double, boolean) + * + * sets adaptiveFilterSize to true + */ + public BandPassActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet restrictToIntervals, final int maxFilterSize, final double sigma) { + this(parser, restrictToIntervals, maxFilterSize, sigma, true); } /** * Create an activity profile that implements a band pass filter on the states + * * @param parser our genome loc parser + * @param restrictToIntervals only include states that are within these intervals, if not null * @param maxFilterSize the maximum size of the band pass filter we are allowed to create, regardless of sigma * @param sigma the variance of the Gaussian kernel for this band pass filter + * @param adaptiveFilterSize if true, use the kernel itself to determine the best filter size */ - public BandPassActivityProfile(final GenomeLocParser parser, final int maxFilterSize, final double sigma) { - this(parser, maxFilterSize, sigma, true); - } - - public BandPassActivityProfile(final GenomeLocParser parser, final int maxFilterSize, final double sigma, final boolean adaptiveFilterSize) { - super(parser); + public BandPassActivityProfile(final GenomeLocParser parser, final GenomeLocSortedSet restrictToIntervals, final int maxFilterSize, final double sigma, final boolean adaptiveFilterSize) { + super(parser, restrictToIntervals); if ( sigma < 0 ) throw new IllegalArgumentException("Sigma must be greater than or equal to 0 but got " + sigma); diff --git a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java index af22d852b..0b6e08fa7 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java @@ -30,7 +30,9 @@ import org.broadinstitute.sting.commandline.ArgumentException; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.walkers.readutils.PrintReads; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.testng.annotations.Test; import java.io.File; @@ -70,10 +72,12 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { @Test public void testEmptyIntervalSetHandling() throws Exception { + GenomeLocParser genomeLocParser = new GenomeLocParser(ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000).getSequenceDictionary()); + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); testEngine.setWalker(new PrintReads()); - testEngine.setIntervals(new GenomeLocSortedSet(null)); + testEngine.setIntervals(new GenomeLocSortedSet(genomeLocParser)); testEngine.validateSuppliedIntervals(); } diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java index 36e17de80..df41dc642 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java @@ -28,18 +28,21 @@ package org.broadinstitute.sting.utils; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; +import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.Iterator; -import java.util.Arrays; +import java.io.File; +import java.util.*; /** * @@ -321,4 +324,62 @@ public class GenomeLocSortedSetUnitTest extends BaseTest { } assertTrue(seqNumber == GenomeLocSortedSetUnitTest.NUMBER_OF_CHROMOSOMES); } + + // ----------------------------------------------------------------------------------------------- + // + // Test getOverlapping + // + // ----------------------------------------------------------------------------------------------- + + @DataProvider(name = "GetOverlapping") + public Object[][] makeGetOverlappingTest() throws Exception { + final GenomeLocParser genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(b37KGReference))); + + List tests = new ArrayList(); + + final GenomeLoc prev1 = genomeLocParser.createGenomeLoc("19", 1, 10); + final GenomeLoc prev2 = genomeLocParser.createGenomeLoc("19", 20, 50); + final GenomeLoc post1 = genomeLocParser.createGenomeLoc("21", 1, 10); + final GenomeLoc post2 = genomeLocParser.createGenomeLoc("21", 20, 50); + + final int chr20Length = genomeLocParser.getContigs().getSequence("20").getSequenceLength(); + for ( final int regionStart : Arrays.asList(1, 10, chr20Length - 10, chr20Length) ) { + for ( final int regionSize : Arrays.asList(1, 10, 100) ) { + final GenomeLoc region = genomeLocParser.createGenomeLocOnContig("20", regionStart, regionStart + regionSize); + final GenomeLoc spanning = genomeLocParser.createGenomeLocOnContig("20", regionStart - 10, region.getStop() + 10); + final GenomeLoc before_into = genomeLocParser.createGenomeLocOnContig("20", regionStart - 10, regionStart + 1); + final GenomeLoc middle = genomeLocParser.createGenomeLocOnContig("20", regionStart + 1, regionStart + 2); + final GenomeLoc middle_past = genomeLocParser.createGenomeLocOnContig("20", region.getStop()-1, region.getStop()+10); + + final List potentials = new LinkedList(); + potentials.add(region); + if ( spanning != null ) potentials.add(spanning); + if ( before_into != null ) potentials.add(before_into); + if ( middle != null ) potentials.add(middle); + if ( middle_past != null ) potentials.add(middle_past); + + for ( final int n : Arrays.asList(1, 2, 3) ) { + for ( final List regions : Utils.makePermutations(potentials, n, false) ) { + tests.add(new Object[]{new GenomeLocSortedSet(genomeLocParser, regions), region}); + tests.add(new Object[]{new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, prev1)), region}); + tests.add(new Object[]{new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, prev1, prev2)), region}); + tests.add(new Object[]{new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, post1)), region}); + tests.add(new Object[]{new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, post1, post2)), region}); + tests.add(new Object[]{new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, prev1, post1)), region}); + tests.add(new Object[]{new GenomeLocSortedSet(genomeLocParser, Utils.append(regions, prev1, prev2, post1, post2)), region}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "GetOverlapping") + public void testGetOverlapping(final GenomeLocSortedSet intervals, final GenomeLoc region) { + final List expectedOverlapping = intervals.getOverlappingFullSearch(region); + final List actualOverlapping = intervals.getOverlapping(region); + Assert.assertEquals(actualOverlapping, expectedOverlapping); + Assert.assertEquals(intervals.overlaps(region), ! expectedOverlapping.isEmpty(), "GenomeLocSortedSet.overlaps didn't return expected result"); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java index d2ea5d11b..6ab429015 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java @@ -34,6 +34,7 @@ import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -48,6 +49,7 @@ import java.util.*; public class ActiveRegionUnitTest extends BaseTest { + private final static boolean DEBUG = true; private GenomeLocParser genomeLocParser; private IndexedFastaSequenceFile seq; private String contig; @@ -88,7 +90,7 @@ public class ActiveRegionUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(enabled = true, dataProvider = "ActionRegionCreationTest") + @Test(enabled = !DEBUG, dataProvider = "ActionRegionCreationTest") public void testCreatingActiveRegions(final GenomeLoc loc, final List supportingStates, final boolean isActive, final int extension) { final ActiveRegion region = new ActiveRegion(loc, supportingStates, isActive, genomeLocParser, extension); Assert.assertEquals(region.getLocation(), loc); @@ -141,7 +143,7 @@ public class ActiveRegionUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "ActiveRegionReads") + @Test(enabled = !DEBUG, dataProvider = "ActiveRegionReads") public void testActiveRegionReads(final GenomeLoc loc, final GATKSAMRecord read) { final GenomeLoc expectedSpan = loc.union(genomeLocParser.createGenomeLoc(read)); @@ -197,6 +199,12 @@ public class ActiveRegionUnitTest extends BaseTest { Assert.assertTrue(region.getReads().get(0).getAlignmentEnd() <= region.getExtendedLoc().getStop()); } + // ----------------------------------------------------------------------------------------------- + // + // Make sure bad inputs are properly detected + // + // ----------------------------------------------------------------------------------------------- + @DataProvider(name = "BadReadsTest") public Object[][] makeBadReadsTest() { List tests = new ArrayList(); @@ -213,11 +221,100 @@ public class ActiveRegionUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "BadReadsTest", expectedExceptions = IllegalArgumentException.class) + @Test(enabled = !DEBUG, dataProvider = "BadReadsTest", expectedExceptions = IllegalArgumentException.class) public void testBadReads(final GATKSAMRecord read1, final GATKSAMRecord read2) { final GenomeLoc loc = genomeLocParser.createGenomeLoc(read1); final ActiveRegion region = new ActiveRegion(loc, null, true, genomeLocParser, 0); region.add(read1); region.add(read2); } + + // ----------------------------------------------------------------------------------------------- + // + // Make sure we can properly cut up an active region based on engine intervals + // + // ----------------------------------------------------------------------------------------------- + + @DataProvider(name = "SplitActiveRegion") + public Object[][] makeSplitActiveRegion() { + List tests = new ArrayList(); + + final GenomeLoc whole_span = genomeLocParser.createGenomeLoc("20", 1, 500); + final GenomeLoc gl_before = genomeLocParser.createGenomeLoc("20", 1, 9); + final GenomeLoc gl_after = genomeLocParser.createGenomeLoc("20", 250, 500); + final GenomeLoc gl_diff_contig = genomeLocParser.createGenomeLoc("19", 40, 50); + + final int regionStart = 10; + final int regionStop = 100; + final GenomeLoc region = genomeLocParser.createGenomeLoc("20", regionStart, regionStop); + + for ( final GenomeLoc noEffect : Arrays.asList(whole_span) ) + tests.add(new Object[]{ + region, + Arrays.asList(noEffect), + Arrays.asList(region)}); + + for ( final GenomeLoc noOverlap : Arrays.asList(gl_before, gl_after, gl_diff_contig) ) + tests.add(new Object[]{ + region, + Arrays.asList(noOverlap), + Arrays.asList()}); + + tests.add(new Object[]{region, + Arrays.asList(genomeLocParser.createGenomeLoc("20", 5, 50)), + Arrays.asList(genomeLocParser.createGenomeLoc("20", regionStart, 50))}); + + tests.add(new Object[]{region, + Arrays.asList(genomeLocParser.createGenomeLoc("20", 50, 200)), + Arrays.asList(genomeLocParser.createGenomeLoc("20", 50, regionStop))}); + + tests.add(new Object[]{region, + Arrays.asList(genomeLocParser.createGenomeLoc("20", 40, 50)), + Arrays.asList(genomeLocParser.createGenomeLoc("20", 40, 50))}); + + tests.add(new Object[]{region, + Arrays.asList(genomeLocParser.createGenomeLoc("20", 20, 30), genomeLocParser.createGenomeLoc("20", 40, 50)), + Arrays.asList(genomeLocParser.createGenomeLoc("20", 20, 30), genomeLocParser.createGenomeLoc("20", 40, 50))}); + + tests.add(new Object[]{region, + Arrays.asList(genomeLocParser.createGenomeLoc("20", 1, 30), genomeLocParser.createGenomeLoc("20", 40, 50)), + Arrays.asList(genomeLocParser.createGenomeLoc("20", regionStart, 30), genomeLocParser.createGenomeLoc("20", 40, 50))}); + + tests.add(new Object[]{region, + Arrays.asList(genomeLocParser.createGenomeLoc("20", 1, 30), genomeLocParser.createGenomeLoc("20", 70, 200)), + Arrays.asList(genomeLocParser.createGenomeLoc("20", regionStart, 30), genomeLocParser.createGenomeLoc("20", 70, regionStop))}); + + tests.add(new Object[]{region, + Arrays.asList(genomeLocParser.createGenomeLoc("20", 1, 30), genomeLocParser.createGenomeLoc("20", 40, 50), genomeLocParser.createGenomeLoc("20", 70, 200)), + Arrays.asList(genomeLocParser.createGenomeLoc("20", regionStart, 30), genomeLocParser.createGenomeLoc("20", 40, 50), genomeLocParser.createGenomeLoc("20", 70, regionStop))}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "SplitActiveRegion") + public void testSplitActiveRegion(final GenomeLoc regionLoc, final List intervalLocs, final List expectedRegionLocs) { + for ( final boolean addSubstates : Arrays.asList(true, false) ) { + final List states; + if ( addSubstates ) { + states = new LinkedList(); + for ( int i = 0; i < regionLoc.size(); i++ ) + states.add(new ActivityProfileState(genomeLocParser.createGenomeLoc(regionLoc.getContig(), regionLoc.getStart() + i), 1.0)); + } else { + states = null; + } + + final ActiveRegion region = new ActiveRegion(regionLoc, states, true, genomeLocParser, 0); + final GenomeLocSortedSet intervals = new GenomeLocSortedSet(genomeLocParser, intervalLocs); + final List regions = region.splitAndTrimToIntervals(intervals); + + Assert.assertEquals(regions.size(), expectedRegionLocs.size(), "Wrong number of split locations"); + for ( int i = 0; i < expectedRegionLocs.size(); i++ ) { + final GenomeLoc expected = expectedRegionLocs.get(i); + final ActiveRegion actual = regions.get(i); + Assert.assertEquals(actual.getLocation(), expected, "Bad region after split"); + Assert.assertEquals(actual.isActive(), region.isActive()); + Assert.assertEquals(actual.getExtension(), region.getExtension()); + } + } + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java index b9fdb3afe..9be250b8e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java @@ -89,7 +89,7 @@ public class ActivityProfileUnitTest extends BaseTest { case Base: return new ActivityProfile(genomeLocParser); case BandPass: // zero size => equivalent to ActivityProfile - return new BandPassActivityProfile(genomeLocParser, 0, 0.01, false); + return new BandPassActivityProfile(genomeLocParser, null, 0, 0.01, false); default: throw new IllegalStateException(type.toString()); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java index 787db9a0f..d5231c30b 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java @@ -87,7 +87,7 @@ public class BandPassActivityProfileUnitTest extends BaseTest { @Test(enabled = ! DEBUG, dataProvider = "BandPassBasicTest") public void testBandPass(final int start, final boolean precedingIsActive, final int nPrecedingSites, final int bandPassSize, final double sigma) { - final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, bandPassSize, sigma, false); + final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, bandPassSize, sigma, false); final int expectedBandSize = bandPassSize * 2 + 1; Assert.assertEquals(profile.getFilteredSize(), bandPassSize, "Wrong filter size"); @@ -142,7 +142,7 @@ public class BandPassActivityProfileUnitTest extends BaseTest { @Test( enabled = ! DEBUG, dataProvider = "BandPassComposition") public void testBandPassComposition(final int bandPassSize, final int integrationLength) { final int start = 1; - final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, bandPassSize, BandPassActivityProfile.DEFAULT_SIGMA); + final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, bandPassSize, BandPassActivityProfile.DEFAULT_SIGMA); final double[] rawActiveProbs = new double[integrationLength + bandPassSize * 2]; // add a buffer so that we can get all of the band pass values @@ -215,7 +215,7 @@ public class BandPassActivityProfileUnitTest extends BaseTest { @Test( enabled = ! DEBUG, dataProvider = "KernelCreation") public void testKernelCreation(final double sigma, final int maxSize, final double[] expectedKernel) { - final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, maxSize, sigma, true); + final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, maxSize, sigma, true); final double[] kernel = profile.getKernel(); Assert.assertEquals(kernel.length, expectedKernel.length); @@ -255,8 +255,8 @@ public class BandPassActivityProfileUnitTest extends BaseTest { final Pair reader = GATKVCFUtils.readAllVCs(file, codec); final List incRegions = new ArrayList(); - final BandPassActivityProfile incProfile = new BandPassActivityProfile(genomeLocParser); - final BandPassActivityProfile fullProfile = new BandPassActivityProfile(genomeLocParser); + final BandPassActivityProfile incProfile = new BandPassActivityProfile(genomeLocParser, null); + final BandPassActivityProfile fullProfile = new BandPassActivityProfile(genomeLocParser, null); int pos = start; for ( final VariantContext vc : reader.getSecond() ) { if ( vc == null ) continue;