From bb2c10b785fa8ad951639429cd4815a9b46ceece Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 14 Mar 2012 12:16:04 -0400 Subject: [PATCH 01/10] Capture the class of the exception in GATKRunReport -- As suggested by David. --- .../broadinstitute/sting/gatk/phonehome/GATKRunReport.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index bc7d5c6ca..3bc336124 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -344,8 +344,12 @@ public class GATKRunReport { @Element(required = false, name = "is-user-exception") Boolean isUserException; + @Element(required = false, name = "exception-class") + Class exceptionClass; + public ExceptionToXML(Throwable e) { message = e.getMessage(); + exceptionClass = e.getClass(); isUserException = e instanceof UserException; for (StackTraceElement element : e.getStackTrace()) { stackTrace.add(element.toString()); From eca055ccadaeb4052529d73c87944e51ca6ea3ba Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 14 Mar 2012 14:26:40 -0400 Subject: [PATCH 03/10] Add option in ValidationAmplicons to only output SNPs and INDELs, ignoring complex variants (or SVs, etc.) --- .../walkers/validation/ValidationAmplicons.java | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index 1d7f92242..3d281ef6c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -117,6 +117,13 @@ public class ValidationAmplicons extends RodWalker { @Argument(doc="Only output valid sequences.",fullName="onlyOutputValidAmplicons",required=false) boolean onlyOutputValidAmplicons = false; + /** + * If ignoreComplexEvents is true, the output fasta file will contain only sequences coming from SNPs and Indels. + * Complex substitutions will be ignored. + */ + @Argument(doc="Ignore complex genomic records.",fullName="ignoreComplexEvents",required=false) + boolean ignoreComplexEvents = false; + /** * BWA single-end alignment is used as a primer specificity proxy. Low-complexity regions (that don't align back to themselves as a best hit) are lowercased. * This changes the size of the k-mer used for alignment. @@ -146,6 +153,7 @@ public class ValidationAmplicons extends RodWalker { StringBuilder rawSequence; boolean sequenceInvalid; boolean isSiteSNP; + boolean isSiteIndel; List invReason; int indelCounter; @@ -244,6 +252,7 @@ public class ValidationAmplicons extends RodWalker { } else if ( validate != null ) { // record variant type in case it's needed in output format isSiteSNP = (validate.isSNP()); + isSiteIndel = (validate.isIndel()); // doesn't matter if there's a mask here too -- this is what we want to validate if ( validate.isFiltered() ) { logger.warn("You are attempting to validate a filtered site. Why are you attempting to validate a filtered site? You should not be attempting to validate a filtered site."); @@ -504,6 +513,9 @@ public class ValidationAmplicons extends RodWalker { } + if (ignoreComplexEvents && !isSiteIndel && !isSiteSNP) + return; + if (!onlyOutputValidAmplicons || !sequenceInvalid) { String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D'); if (sequenomOutput) { @@ -512,7 +524,7 @@ public class ValidationAmplicons extends RodWalker { out.printf("%s_%s %s%n", allelePos != null ? allelePos.toString() : "multiple", probeName, seqIdentity); } else if (ilmnOutput) { - String type = isSiteSNP?"SNP":"INDEL"; + String type = isSiteSNP?"SNP":(isSiteIndel?"INDEL":"OTHER"); seqIdentity = seqIdentity.replace("*",""); // no * in ref allele out.printf("%s,%s,%s,%s,%d,37,1000G,ExomePhase1,Forward,Plus,FALSE%n",probeName,type,seqIdentity,allelePos.getContig(),allelePos.getStart()); } From e73406b9b50bbfbce270c50641dafc7139ac0eed Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 14 Mar 2012 09:43:45 -0400 Subject: [PATCH 05/10] CountReadsInActiveRegions now emits a detailed GATK report -- This report details which intervals are coming in and how many reads they contain -- Added integration test to verify that the intervals aren't changing, before heading into the ART refactor --- ...ntReadsInActiveRegionsIntegrationTest.java | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java new file mode 100644 index 000000000..44cf87b45 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.activeregionqc; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Tests CountReadsInActiveRegions + */ +public class CountReadsInActiveRegionsIntegrationTest extends WalkerTest { + @Test + public void basicTest() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CountReadsInActiveRegions -R " + b37KGReference + " -I " + b37GoodNA12878BAM + " -L 20:10,000,000-10,200,000 -o %s", + 1, + Arrays.asList("fcd581aa6befe85c7297509fa7b34edf")); + executeTest("CountReadsInActiveRegions:", spec); + } +} \ No newline at end of file From 5bcb5c743326e671964a7a17d2498f1277e74cdd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 14 Mar 2012 11:21:31 -0400 Subject: [PATCH 06/10] Preliminary refactoring of ART -- Refactored ART into clearer, simpler procedures. Attempted to merge shared code into utility classes. -- Added some docs -- Created a new, testable ActivityProfile that represents as a class the probability of a base being active or inactive -- Separated band-pass filtering from creation of active regions. Now you can band pass filter a profile to make another profile, and then that is explicitly converted to active regions -- Misc. utility functions in ActiveRegionWalker such as hasPresetActiveRegions() -- Many TODOs in ActivityProfile. --- .../traversals/TraverseActiveRegions.java | 155 +++++++++--------- .../gatk/walkers/ActiveRegionWalker.java | 4 + .../utils/activeregion/ActivityProfile.java | 144 ++++++++++++++++ 3 files changed, 225 insertions(+), 78 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 3f24e6585..c0fc78e3c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.traversals; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -10,6 +11,7 @@ import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActivityProfile; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -42,38 +44,31 @@ public class TraverseActiveRegions extends TraversalEngine isActiveList = new ArrayList(); - GenomeLoc firstIsActiveStart = null; + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); - //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); - ReferenceOrderedView referenceOrderedDataView = null; - if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) - referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); - else - referenceOrderedDataView = (RodLocusView)locusView; + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); // We keep processing while the next reference location is within the interval GenomeLoc prevLoc = null; while( locusView.hasNext() ) { final AlignmentContext locus = locusView.next(); GenomeLoc location = locus.getLocation(); + if(prevLoc != null) { - for(int iii = prevLoc.getStart() + 1; iii < location.getStart(); iii++ ) { + // fill in the active / inactive labels from the stop of the previous location to the start of this location + // TODO refactor to separate function + for(int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++ ) { final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii); if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) { - final double isActiveProb = ( walker.presetActiveRegions == null ? 0.0 : ( walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 ) ); - isActiveList.add( isActiveProb ); - if( firstIsActiveStart == null ) { - firstIsActiveStart = fakeLoc; - } + final double isActiveProb = ( walker.hasPresetActiveRegions() && walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 ); + profile.add(fakeLoc, isActiveProb); } } } @@ -89,12 +84,8 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine activeRegions = integrateActiveList( isActiveList, firstIsActiveStart, activeRegionExtension, walker.presetActiveRegions != null ); - logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." ); - if( walker.activeRegionOutStream == null ) { - workQueue.addAll( activeRegions ); + // band-pass filter the list of isActive probabilities and turn into active regions + final ActivityProfile bandPassFiltered = profile.bandPassFilter(); + final List activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension ); + logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + + // add to work queue + if( walker.activeRegionOutStream == null ) { + workQueue.addAll( activeRegions ); } else { // Just want to output the active regions to a file, not actually process them for( final ActiveRegion activeRegion : activeRegions ) { if( activeRegion.isActive ) { @@ -134,21 +130,55 @@ public class TraverseActiveRegions extends TraversalEngine walker, T sum) { + + // -------------------------------------------------------------------------------- + // + // simple utility functions + // + // -------------------------------------------------------------------------------- + + private final double walkerActiveProb(final ActiveRegionWalker walker, + final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext locus, final GenomeLoc location) { + if ( walker.hasPresetActiveRegions() ) { + return walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0; + } else { + return walker.isActive( tracker, refContext, locus ); + } + } + + private ReferenceOrderedView getReferenceOrderedView( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + final LocusView locusView) { + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + return new ManagingReferenceOrderedView( dataProvider ); + else + return (RodLocusView)locusView; + } + + // -------------------------------------------------------------------------------- + // + // code to handle processing active regions + // + // -------------------------------------------------------------------------------- + + private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them while( workQueue.peek() != null ) { - final ActiveRegion activeRegion = workQueue.remove(); - sum = processActiveRegion( activeRegion, myReads, workQueue, sum, (ActiveRegionWalker) walker ); + final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); + if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { + final ActiveRegion activeRegion = workQueue.remove(); + sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker ); + } else { + break; + } } return sum; @@ -193,6 +223,12 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine integrateActiveList( final ArrayList activeList, final GenomeLoc firstIsActiveStart, final int activeRegionExtension, final boolean presetRegions ) { - - final double ACTIVE_PROB_THRESHOLD = 0.2; // BUGBUG: needs to be set-able by the walker author - final ArrayList returnList = new ArrayList(); - if( activeList.size() == 0 ) { - return returnList; - } else if( activeList.size() == 1 ) { - returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart(), firstIsActiveStart.getStart()), - activeList.get(0) > ACTIVE_PROB_THRESHOLD, engine.getGenomeLocParser(), activeRegionExtension ) ); - return returnList; - } else { - final Double[] activeProbArray = activeList.toArray(new Double[activeList.size()]); - final double[] filteredProbArray = new double[activeProbArray.length]; - final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // BUGBUG: needs to be set-able by the walker author - final int MAX_ACTIVE_REGION = ( presetRegions ? 16001 : 425 ); // BUGBUG: needs to be set-able by the walker author - for( int iii = 0; iii < activeProbArray.length; iii++ ) { - double maxVal = 0; - for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(activeList.size(), iii+FILTER_SIZE+1); jjj++ ) { - if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; } - } - filteredProbArray[iii] = maxVal; - } - - boolean curStatus = filteredProbArray[0] > ACTIVE_PROB_THRESHOLD; - int curStart = 0; - for(int iii = 1; iii < filteredProbArray.length; iii++ ) { - final boolean thisStatus = filteredProbArray[iii] > ACTIVE_PROB_THRESHOLD; - if( curStatus != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) { - returnList.add( new ActiveRegion( - engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (iii-1)), - curStatus, engine.getGenomeLocParser(), activeRegionExtension ) ); - curStatus = thisStatus; - curStart = iii; - } - } - if( curStart != filteredProbArray.length-1 ) { - returnList.add( new ActiveRegion( - engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (filteredProbArray.length-1)), - curStatus, engine.getGenomeLocParser(), activeRegionExtension ) ); - } - return returnList; - } + /** + * Special function called in LinearMicroScheduler to empty out the work queue. + * Ugly for now but will be cleaned up when we push this functionality more into the engine + */ + public T endTraversal( final Walker walker, T sum) { + return processActiveRegions((ActiveRegionWalker)walker, sum, Integer.MAX_VALUE, null); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index 6403f15a2..8ff4b2f6f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -45,6 +45,10 @@ public abstract class ActiveRegionWalker extends Walker isActiveList; + + // todo -- add upfront the start and stop of the intervals + // todo -- check that no regions are unexpectedly missing + // todo -- add unit tests + // TODO -- own preset regions + public ActivityProfile(final GenomeLocParser parser, final boolean presetRegions) { + this(parser, presetRegions, new ArrayList(), null); + } + + protected ActivityProfile(final GenomeLocParser parser, final boolean presetRegions, final List isActiveList, final GenomeLoc regionStartLoc) { + this.parser = parser; + this.presetRegions = presetRegions; + this.isActiveList = isActiveList; + this.regionStartLoc = regionStartLoc; + } + + public void add(final GenomeLoc loc, final double score) { + // todo -- test for validity + isActiveList.add(score); + if( regionStartLoc == null ) { + regionStartLoc = loc; + } + } + + public int size() { + return isActiveList.size(); + } + + /** + * Band pass this ActivityProfile, producing a new profile that's band pass filtered + * @return a new ActivityProfile that's the band-pass filtered version of this profile + */ + public ActivityProfile bandPassFilter() { + final Double[] activeProbArray = isActiveList.toArray(new Double[isActiveList.size()]); + final Double[] filteredProbArray = new Double[activeProbArray.length]; + final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // TODO: needs to be set-able by the walker author + for( int iii = 0; iii < activeProbArray.length; iii++ ) { + double maxVal = 0; + for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(isActiveList.size(), iii+FILTER_SIZE+1); jjj++ ) { + if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; } + } + filteredProbArray[iii] = maxVal; + } + + return new ActivityProfile(parser, presetRegions, Arrays.asList(filteredProbArray), regionStartLoc); + } + + /** + * Partition this profile into active regions + * @param activeRegionExtension + * @return + */ + public List createActiveRegions( final int activeRegionExtension ) { + final int MAX_ACTIVE_REGION = ( presetRegions ? 16001 : 425 ); // TODO: needs to be set-able by the walker author + final double ACTIVE_PROB_THRESHOLD = 0.2; // TODO: needs to be set-able by the walker author + + if( isActiveList.size() == 0 ) { + // no elements in the active list, just return an empty one + return Collections.emptyList(); + } else if( isActiveList.size() == 1 ) { + // there's a single element, it's either active or inactive + boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD; + final ActiveRegion region = createActiveRegion(isActive, 0, 0, activeRegionExtension ); + return Collections.singletonList(region); + } else { + // there are 2+ elements, divide these up into regions + final ArrayList returnList = new ArrayList(); + boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD; + int curStart = 0; + for(int iii = 1; iii < isActiveList.size(); iii++ ) { + final boolean thisStatus = isActiveList.get(iii) > ACTIVE_PROB_THRESHOLD; + if( isActive != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) { + returnList.add( createActiveRegion(isActive, curStart, iii-1, activeRegionExtension) ); + isActive = thisStatus; + curStart = iii; + } + } + + if( curStart != isActiveList.size()-1 ) { + returnList.add( createActiveRegion(isActive, curStart, isActiveList.size()-1, activeRegionExtension) ); + } + return returnList; + } + } + + /** + * Helper routine to create an active region based on our current start and end offsets + * @param isActive should the region be active? + * @param curStart offset (0-based) from the start of this region + * @param curEnd offset (0-based) from the start of this region + * @param activeRegionExtension + * @return a fully initialized ActiveRegion with the above properties + */ + private final ActiveRegion createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension) { + final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd); + return new ActiveRegion( loc, isActive, parser, activeRegionExtension ); + } +} From e440c9be987868a84a7bf152fc88db1b1854048e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 14 Mar 2012 14:59:31 -0400 Subject: [PATCH 07/10] Clean up logic for adding reads to ART cache -- No longer has duplicate code --- .../traversals/TraverseActiveRegions.java | 52 +++++++++++-------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index c0fc78e3c..ff376fcd2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -94,19 +94,12 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension ); - logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); - // add to work queue - if( walker.activeRegionOutStream == null ) { - workQueue.addAll( activeRegions ); - } else { // Just want to output the active regions to a file, not actually process them - for( final ActiveRegion activeRegion : activeRegions ) { - if( activeRegion.isActive ) { - walker.activeRegionOutStream.println( activeRegion.getLocation() ); - } - } - } + // add active regions to queue of regions to process + workQueue.addAll( activeRegions ); + logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); // now go and process all of the active regions sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); @@ -170,6 +155,29 @@ public class TraverseActiveRegions extends TraversalEngine walker, T sum, final int minStart, final String currentContig ) { + if( walker.activeRegionOutStream != null ) { + writeActiveRegionsToStream(walker); + return sum; + } else { + return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig); + } + } + + /** + * Write out each active region to the walker activeRegionOutStream + * + * @param walker + */ + private void writeActiveRegionsToStream( final ActiveRegionWalker walker ) { + // Just want to output the active regions to a file, not actually process them + for( final ActiveRegion activeRegion : workQueue ) { + if( activeRegion.isActive ) { + walker.activeRegionOutStream.println( activeRegion.getLocation() ); + } + } + } + + private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them while( workQueue.peek() != null ) { final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); From 7c5cdb51c222a2b297e5a10724675394de5826fb Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 14 Mar 2012 17:26:06 -0400 Subject: [PATCH 08/10] UnitTests for ActivityProfile and minor ART cleanup -- TODO for ryan -- there are bugs in ActivityProfile code that I cannot fix right now :-( -- UnitTesting framework for ActivityProfile -- needs to be expanded -- Minor helper functions for ActiveRegion to help with unit tests --- .../traversals/TraverseActiveRegions.java | 1 + .../utils/activeregion/ActiveRegion.java | 14 ++ .../utils/activeregion/ActivityProfile.java | 8 +- .../activeregion/ActivityProfileUnitTest.java | 149 ++++++++++++++++++ 4 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index ff376fcd2..f9a185650 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -179,6 +179,7 @@ public class TraverseActiveRegions extends TraversalEngine walker, T sum, final int minStart, final String currentContig ) { // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them + // TODO can implement parallel traversal here while( workQueue.peek() != null ) { final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index c2e69ee2d..37822dc84 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -34,6 +34,11 @@ public class ActiveRegion implements HasGenomeLocation { fullExtentReferenceLoc = extendedLoc; } + @Override + public String toString() { + return "ActiveRegion " + activeRegionLoc.toString(); + } + // add each read to the bin and extend the reference genome activeRegionLoc if needed public void add( final GATKSAMRecord read ) { fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) ); @@ -78,4 +83,13 @@ public class ActiveRegion implements HasGenomeLocation { public void clearReads() { reads.clear(); } public void remove( final GATKSAMRecord read ) { reads.remove( read ); } public void removeAll( final ArrayList readsToRemove ) { reads.removeAll( readsToRemove ); } + + public boolean equalExceptReads(final ActiveRegion other) { + if ( ! activeRegionLoc.equals(other.activeRegionLoc)) return false; + if ( isActive != other.isActive ) return false; + if ( genomeLocParser != other.genomeLocParser ) return false; + if ( extension != other.extension ) return false; + if ( ! extendedLoc.equals(other.extendedLoc) ) return false; + return true; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 14ab97ee4..79b17cdba 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.activeregion; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.ArrayList; import java.util.Arrays; @@ -45,6 +46,8 @@ public class ActivityProfile { GenomeLoc regionStartLoc = null; final List isActiveList; + private GenomeLoc lastLoc = null; + // todo -- add upfront the start and stop of the intervals // todo -- check that no regions are unexpectedly missing // todo -- add unit tests @@ -61,7 +64,10 @@ public class ActivityProfile { } public void add(final GenomeLoc loc, final double score) { - // todo -- test for validity + if ( loc.size() != 1 ) + throw new ReviewedStingException("Bad add call to ActivityProfile: loc " + loc + " size != 1" ); + if ( lastLoc != null && loc.getStart() != lastLoc.getStop() + 1 ) + throw new ReviewedStingException("Bad add call to ActivityProfile: lastLoc added " + lastLoc + " and next is " + loc); isActiveList.add(score); if( regionStartLoc == null ) { regionStartLoc = loc; diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java new file mode 100644 index 000000000..e6d0322c0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting.utils.activeregion; + + +// the imports for unit testing. + + +import net.sf.picard.reference.ReferenceSequenceFile; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.recalibration.QualQuantizer; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + + +public class ActivityProfileUnitTest extends BaseTest { + private GenomeLocParser genomeLocParser; + private GenomeLoc startLoc; + + @BeforeClass + public void init() throws FileNotFoundException { + // sequence + ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); + genomeLocParser = new GenomeLocParser(seq); + startLoc = genomeLocParser.createGenomeLoc("chr1", 1, 1, 100); + } + + // -------------------------------------------------------------------------------- + // + // Basic tests Provider + // + // -------------------------------------------------------------------------------- + + private class BasicActivityProfileTestProvider extends TestDataProvider { + List probs; + List expectedRegions; + int extension = 0; + GenomeLoc regionStart = startLoc; + + public BasicActivityProfileTestProvider(final List probs, final List expectedRegions) { + super(BasicActivityProfileTestProvider.class); + this.probs = probs; + this.expectedRegions = expectedRegions; + setName(getName()); + } + + public BasicActivityProfileTestProvider(final List probs, boolean startActive, int ... startsAndStops) { + super(BasicActivityProfileTestProvider.class); + this.probs = probs; + this.expectedRegions = toRegions(startActive, startsAndStops); + setName(getName()); + } + + private String getName() { + return String.format("probs=%s expectedRegions=%s", Utils.join(",", probs), Utils.join(",", expectedRegions)); + } + + private List toRegions(boolean isActive, int[] startsAndStops) { + List l = new ArrayList(); + for ( int i = 0; i < startsAndStops.length - 1; i++) { + int start = regionStart.getStart() + startsAndStops[i]; + int end = regionStart.getStart() + startsAndStops[i+1] - 1; + GenomeLoc activeLoc = genomeLocParser.createGenomeLoc(regionStart.getContig(), start, end); + ActiveRegion r = new ActiveRegion(activeLoc, isActive, genomeLocParser, extension); + l.add(r); + isActive = ! isActive; + } + return l; + } + } + + @DataProvider(name = "BasicActivityProfileTestProvider") + public Object[][] makeQualIntervalTestProvider() { + new BasicActivityProfileTestProvider(Arrays.asList(1.0), true, 0, 1); + // TODO -- RYAN THESE ALL EXHIBIT AN OFF-BY-ONE ERROR. SORRY I HAVE TO GO BUT I CANNOT FIX NOW + //new BasicActivityProfileTestProvider(Arrays.asList(1.0, 0.0), true, 0, 1, 2); + //new BasicActivityProfileTestProvider(Arrays.asList(0.0, 1.0), false, 0, 1, 2); + //new BasicActivityProfileTestProvider(Arrays.asList(1.0, 0.0, 1.0), true, 0, 1, 2, 3); + new BasicActivityProfileTestProvider(Arrays.asList(1.0, 1.0, 1.0), true, 0, 3); + + return BasicActivityProfileTestProvider.getTests(BasicActivityProfileTestProvider.class); + } + + @Test(dataProvider = "BasicActivityProfileTestProvider") + public void testBasicActivityProfile(BasicActivityProfileTestProvider cfg) { + ActivityProfile profile = new ActivityProfile(genomeLocParser, false); + + Assert.assertEquals(profile.parser, genomeLocParser); + + for ( int i = 0; i < cfg.probs.size(); i++ ) { + double p = cfg.probs.get(i); + GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i); + profile.add(loc, p); + } + Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() )); + + Assert.assertEquals(profile.size(), cfg.probs.size()); + Assert.assertEquals(profile.isActiveList, cfg.probs); + + assertRegionsAreEqual(profile.createActiveRegions(0), cfg.expectedRegions); + } + + private void assertRegionsAreEqual(List actual, List expected) { + Assert.assertEquals(actual.size(), expected.size()); + for ( int i = 0; i < actual.size(); i++ ) { + Assert.assertTrue(actual.get(i).equalExceptReads(expected.get(i))); + } + } + + // todo -- test extensions +} \ No newline at end of file