From 8d9b0f1bd57d4d569d6ec27e7d372a923e11b9bd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 17 Jan 2013 17:31:56 -0500 Subject: [PATCH] Restructure ActivityProfiler into root class ActivityProfile and derived class BandPassActivityProfile -- Required before I jump in an redo the entire activity profile so it's can be run imcrementally -- This restructuring makes the differences between the two functionalities clearer, as almost all of the functionality is in the base class. The only functionality provided by the BandPassActivityProfile is isolated to a finalizeProfile function overloaded from the base class. -- Renamed ActivityProfileResult to ActivityProfileState, as this is a clearer indication of its actual functionality. Almost all of the misc. walker changes are due to this name update -- Code cleanup and docs for TraverseActiveRegions -- Expanded unit tests for ActivityProfile and ActivityProfileState --- .../targets/FindCoveredIntervals.java | 6 +- .../haplotypecaller/HaplotypeCaller.java | 13 +- .../traversals/TraverseActiveRegions.java | 95 +++++---- .../gatk/walkers/ActiveRegionWalker.java | 4 +- .../utils/activeregion/ActivityProfile.java | 183 ++++++++++-------- ...eResult.java => ActivityProfileState.java} | 22 +-- .../activeregion/BandPassActivityProfile.java | 84 ++++++++ .../traversals/DummyActiveRegionWalker.java | 6 +- ...java => ActivityProfileStateUnitTest.java} | 20 +- .../activeregion/ActivityProfileUnitTest.java | 55 ++++-- 10 files changed, 308 insertions(+), 180 deletions(-) rename public/java/src/org/broadinstitute/sting/utils/activeregion/{ActivityProfileResult.java => ActivityProfileState.java} (77%) create mode 100644 public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java rename public/java/test/org/broadinstitute/sting/utils/activeregion/{ActivityProfileResultTest.java => ActivityProfileStateUnitTest.java} (79%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java index 08de5a6aa..74ff77e4b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java @@ -57,7 +57,7 @@ import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; import org.broadinstitute.sting.gatk.walkers.PartitionBy; import org.broadinstitute.sting.gatk.walkers.PartitionType; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import java.io.PrintStream; @@ -74,12 +74,12 @@ public class FindCoveredIntervals extends ActiveRegionWalker { @Override // Look to see if the region has sufficient coverage - public ActivityProfileResult isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup()); // note the linear probability scale - return new ActivityProfileResult(ref.getLocus(), Math.min(depth / coverageThreshold, 1)); + return new ActivityProfileState(ref.getLocus(), Math.min(depth / coverageThreshold, 1)); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 26f2560b7..9bb04421c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -56,7 +56,6 @@ import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; @@ -71,7 +70,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.*; @@ -382,7 +381,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Override @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"}) - public ActivityProfileResult isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { + public ActivityProfileState isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) { @@ -391,15 +390,15 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } } if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) { - return new ActivityProfileResult(ref.getLocus(), 1.0); + return new ActivityProfileState(ref.getLocus(), 1.0); } } if( USE_ALLELES_TRIGGER ) { - return new ActivityProfileResult( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); + return new ActivityProfileState( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); } - if( context == null ) { return new ActivityProfileResult(ref.getLocus(), 0.0); } + if( context == null ) { return new ActivityProfileState(ref.getLocus(), 0.0); } final List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied noCall.add(Allele.NO_CALL); @@ -436,7 +435,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL); final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ); - return new ActivityProfileResult( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() ); + return new ActivityProfileState( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileState.Type.NONE, averageHQSoftClips.mean() ); } //--------------------------------------------------------------------------------------------------------------- diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index a7e4d7649..de0bfd1f1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -41,7 +41,8 @@ import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfile; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; +import org.broadinstitute.sting.utils.activeregion.BandPassActivityProfile; import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -71,6 +72,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine myReads = new LinkedList(); private GenomeLoc spanOfLastReadSeen = null; + @Override + public void initialize(GenomeAnalysisEngine engine, Walker walker, ProgressMeter progressMeter) { + super.initialize(engine, walker, progressMeter); + + final ActiveRegionWalker arWalker = (ActiveRegionWalker)walker; + if ( arWalker.wantsExtendedReads() && ! arWalker.wantsNonPrimaryReads() ) { + throw new IllegalArgumentException("Active region walker " + arWalker + " requested extended events but not " + + "non-primary reads, an inconsistent state. Please modify the walker"); + } + + activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); + maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); + walkerHasPresetRegions = arWalker.hasPresetActiveRegions(); + } + + // ------------------------------------------------------------------------------------- + // + // Utility functions + // + // ------------------------------------------------------------------------------------- + protected int getActiveRegionExtension() { return activeRegionExtension; } @@ -97,19 +120,6 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine walker, + final LocusShardDataProvider dataProvider, + final LocusView locusView) { + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + return new ManagingReferenceOrderedView( dataProvider ); + else + return (RodLocusView)locusView; + } + + + // ------------------------------------------------------------------------------------- + // + // Working with ActivityProfiles and Active Regions + // + // ------------------------------------------------------------------------------------- + /** * Take the individual isActive calls and integrate them into contiguous active regions and * add these blocks of work to the work queue @@ -133,28 +159,26 @@ public class TraverseActiveRegions extends TraversalEngine walker, + protected final ActivityProfileState walkerActiveProb(final ActiveRegionWalker walker, final RefMetaDataTracker tracker, final ReferenceContext refContext, final AlignmentContext locus, final GenomeLoc location) { - if ( walker.hasPresetActiveRegions() ) { - return new ActivityProfileResult(location, walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0); + if ( walkerHasPresetRegions ) { + return new ActivityProfileState(location, walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0); } else { return walker.isActive( tracker, refContext, locus ); } } - protected ReferenceOrderedView getReferenceOrderedView(final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, - final LocusView locusView) { - if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) - return new ManagingReferenceOrderedView( dataProvider ); + private ActivityProfile makeNewActivityProfile() { + if ( walkerHasPresetRegions ) + return new ActivityProfile(engine.getGenomeLocParser()); else - return (RodLocusView)locusView; + return new BandPassActivityProfile(engine.getGenomeLocParser()); } /** @@ -171,6 +195,12 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + ActivityProfile profile = makeNewActivityProfile(); ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); @@ -245,9 +269,8 @@ public class TraverseActiveRegions extends TraversalEngine reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); for( final GATKSAMRecord read : reads ) { if ( appearedInLastShard(locOfLastReadAtTraversalStart, read) ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index f937c2458..820100f7f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -40,7 +40,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalSetRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; @@ -114,7 +114,7 @@ public abstract class ActiveRegionWalker extends Walker= 0.0", "result.isActiveProb <= 1.0"}) - public abstract ActivityProfileResult isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context); + public abstract ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context); // Map over the ActiveRegion public abstract MapType map(final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker); diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 909d99424..fd05ddd7b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -1,35 +1,34 @@ /* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ package org.broadinstitute.sting.utils.activeregion; +import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.MathUtils; import java.util.ArrayList; import java.util.Collections; @@ -43,36 +42,37 @@ import java.util.List; * @since Date created */ public class ActivityProfile { - final GenomeLocParser parser; - final boolean presetRegions; - GenomeLoc regionStartLoc = null; - GenomeLoc regionStopLoc = null; - final List isActiveList; - private static final int FILTER_SIZE = 80; - private static final double[] GaussianKernel; + private final static double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author - static { - GaussianKernel = new double[2*FILTER_SIZE + 1]; - for( int iii = 0; iii < 2*FILTER_SIZE + 1; iii++ ) { - GaussianKernel[iii] = MathUtils.NormalDistribution(FILTER_SIZE, 55.0, iii); - } + protected final List isActiveList; + protected final GenomeLocParser parser; + + protected GenomeLoc regionStartLoc = null; + protected GenomeLoc regionStopLoc = null; + + public ActivityProfile(final GenomeLocParser parser) { + this(parser, new ArrayList(), null); } - // todo -- add upfront the start and stop of the intervals - // todo -- check that no regions are unexpectedly missing - // todo -- add unit tests - // TODO -- own preset regions - public ActivityProfile(final GenomeLocParser parser, final boolean presetRegions) { - this(parser, presetRegions, new ArrayList(), null); - } - - protected ActivityProfile(final GenomeLocParser parser, final boolean presetRegions, final List isActiveList, final GenomeLoc regionStartLoc) { + protected ActivityProfile(final GenomeLocParser parser, final List isActiveList, final GenomeLoc regionStartLoc) { this.parser = parser; - this.presetRegions = presetRegions; this.isActiveList = isActiveList; this.regionStartLoc = regionStartLoc; } + /** + * Create a profile of the same class as this object containing just the provided isActiveList + * + * Used by clients to create derived activity profiles (such as ones without the starting X + * sites because they've been removed in an ActiveRegion) of the same class. + * + * @param isActiveList the active results list to use in the derived instance + * @return a freshly allocated data set + */ + protected ActivityProfile createDerivedProfile(final List isActiveList) { + return new ActivityProfile(parser, isActiveList, regionStartLoc); + } + @Override public String toString() { return "ActivityProfile{" + @@ -82,14 +82,14 @@ public class ActivityProfile { } /** - * Add the next ActivityProfileResult to this profile. + * Add the next ActivityProfileState to this profile. * * Must be contiguous with the previously added result, or an IllegalArgumentException will be thrown * - * @param result a well-formed ActivityProfileResult result to incorporate into this profile + * @param result a well-formed ActivityProfileState result to incorporate into this profile */ @Requires("result != null") - public void add(final ActivityProfileResult result) { + public void add(final ActivityProfileState result) { final GenomeLoc loc = result.getLoc(); if ( regionStartLoc == null ) { @@ -104,31 +104,67 @@ public class ActivityProfile { isActiveList.add(result); } + /** + * How many profile results are in this profile? + * @return the number of profile results + */ + @Ensures("result >= 0") public int size() { return isActiveList.size(); } + /** + * Is this profile empty? + * @return true if the profile is empty + */ + @Ensures("isEmpty() == (size() == 0)") public boolean isEmpty() { return isActiveList.isEmpty(); } - public boolean hasPresetRegions() { - return presetRegions; + /** + * Get the list of active profile results in this object + * @return a non-null, ordered list of active profile results + */ + @Ensures("result != null") + protected List getActiveList() { + return isActiveList; } /** - * Band pass this ActivityProfile, producing a new profile that's band pass filtered - * @return a new ActivityProfile that's the band-pass filtered version of this profile + * Finalize the probabilities in this activity profile, preparing it for a future + * call to createActiveRegions. This function returns a new profile with cleaned + * up activity estimates. + * + * This code looks at the current list of states, cleans them up, and then returns + * a newly allocated ActivityProfile + * + * @return a newly allocated ActivityProfile based on the current state of this + * profile, but that has been "finalized" as required by the profile implementation */ - public ActivityProfile bandPassFilter() { - final double[] activeProbArray = new double[isActiveList.size()]; + public ActivityProfile finalizeProfile() { int iii = 0; - for( final ActivityProfileResult result : isActiveList ) { + for( final double prob : finalizeProbabilities() ) { + final ActivityProfileState result = isActiveList.get(iii++); + result.isActiveProb = prob; + result.resultState = ActivityProfileState.Type.NONE; + result.resultValue = null; + } + + return createDerivedProfile(isActiveList); + } + + public double[] finalizeProbabilities() { + final double[] activeProbArray = new double[isActiveList.size()]; + + int iii = 0; + for( final ActivityProfileState result : isActiveList ) { activeProbArray[iii++] = result.isActiveProb; } + iii = 0; - for( final ActivityProfileResult result : isActiveList ) { - if( result.resultState.equals(ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS) ) { // special code to deal with the problem that high quality soft clipped bases aren't added to pileups + for( final ActivityProfileState result : isActiveList ) { + if( result.resultState.equals(ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS) ) { // special code to deal with the problem that high quality soft clipped bases aren't added to pileups final int numHQClips = result.resultValue.intValue(); for( int jjj = Math.max(0, iii - numHQClips); jjj < Math.min(activeProbArray.length, iii+numHQClips); jjj++ ) { activeProbArray[jjj] = Math.max(activeProbArray[jjj], activeProbArray[iii]); @@ -137,29 +173,7 @@ public class ActivityProfile { iii++; } - final double[] filteredProbArray; - if( !presetRegions ) { - // if we aren't using preset regions, actually apply the band pass filter for activeProbArray into filteredProbArray - filteredProbArray = new double[activeProbArray.length]; - for( iii = 0; iii < activeProbArray.length; iii++ ) { - final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii)); - final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1)); - filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel); - } - } else { - // otherwise we simply use the activeProbArray directly - filteredProbArray = activeProbArray; - } - - iii = 0; - for( final double prob : filteredProbArray ) { - final ActivityProfileResult result = isActiveList.get(iii++); - result.isActiveProb = prob; - result.resultState = ActivityProfileResult.ActivityProfileResultState.NONE; - result.resultValue = null; - } - - return new ActivityProfile(parser, presetRegions, isActiveList, regionStartLoc); + return activeProbArray; } /** @@ -168,7 +182,6 @@ public class ActivityProfile { * @return the list of active regions */ public List createActiveRegions( final int activeRegionExtension, final int maxRegionSize ) { - final double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author final ArrayList returnList = new ArrayList(); if( isActiveList.size() == 0 ) { @@ -203,11 +216,11 @@ public class ActivityProfile { * @param activeRegionExtension the amount of margin overlap in the active region * @return a fully initialized ActiveRegion with the above properties */ - private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) { + private List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) { return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize, new ArrayList()); } - private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List returnList) { + private List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List returnList) { if( !isActive || curEnd - curStart < maxRegionSize ) { final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd); returnList.add(new ActiveRegion(loc, isActive, parser, activeRegionExtension)); diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java similarity index 77% rename from public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java rename to public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java index bf2636465..38e89b605 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java @@ -34,40 +34,40 @@ import org.broadinstitute.sting.utils.GenomeLoc; * User: rpoplin * Date: 7/27/12 */ -public class ActivityProfileResult { +public class ActivityProfileState { private GenomeLoc loc; public double isActiveProb; - public ActivityProfileResultState resultState; + public Type resultState; public Number resultValue; - public enum ActivityProfileResultState { + public enum Type { NONE, HIGH_QUALITY_SOFT_CLIPS } /** - * Create a new ActivityProfileResult at loc with probability of being active of isActiveProb + * Create a new ActivityProfileState at loc with probability of being active of isActiveProb * * @param loc the position of the result profile (for debugging purposes) * @param isActiveProb the probability of being active (between 0 and 1) */ @Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"}) - public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb ) { - this(loc, isActiveProb, ActivityProfileResultState.NONE, null); + public ActivityProfileState(final GenomeLoc loc, final double isActiveProb) { + this(loc, isActiveProb, Type.NONE, null); } /** - * Create a new ActivityProfileResult at loc with probability of being active of isActiveProb that maintains some + * Create a new ActivityProfileState at loc with probability of being active of isActiveProb that maintains some * information about the result state and value (TODO RYAN -- what do these mean?) * * @param loc the position of the result profile (for debugging purposes) * @param isActiveProb the probability of being active (between 0 and 1) */ @Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"}) - public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb, final ActivityProfileResultState resultState, final Number resultValue ) { + public ActivityProfileState(final GenomeLoc loc, final double isActiveProb, final Type resultState, final Number resultValue) { // make sure the location of that activity profile is 1 if ( loc.size() != 1 ) - throw new IllegalArgumentException("Location for an ActivityProfileResult must have to size 1 bp but saw " + loc); + throw new IllegalArgumentException("Location for an ActivityProfileState must have to size 1 bp but saw " + loc); this.loc = loc; this.isActiveProb = isActiveProb; @@ -76,7 +76,7 @@ public class ActivityProfileResult { } /** - * Get the genome loc associated with the ActivityProfileResult + * Get the genome loc associated with the ActivityProfileState * @return the location of this result */ @Ensures("result != null") @@ -86,7 +86,7 @@ public class ActivityProfileResult { @Override public String toString() { - return "ActivityProfileResult{" + + return "ActivityProfileState{" + "loc=" + loc + ", isActiveProb=" + isActiveProb + ", resultState=" + resultState + diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java new file mode 100644 index 000000000..cef700419 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java @@ -0,0 +1,84 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.activeregion; + +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.ArrayList; +import java.util.List; + +/** + * + * + * @author Mark DePristo + * @since 2011 + */ +public class BandPassActivityProfile extends ActivityProfile { + private static final int FILTER_SIZE = 80; + private static final double[] GaussianKernel; + + static { + GaussianKernel = new double[2*FILTER_SIZE + 1]; + for( int iii = 0; iii < 2*FILTER_SIZE + 1; iii++ ) { + GaussianKernel[iii] = MathUtils.NormalDistribution(FILTER_SIZE, 55.0, iii); + } + } + + public BandPassActivityProfile(final GenomeLocParser parser) { + this(parser, new ArrayList(), null); + } + + public BandPassActivityProfile(final GenomeLocParser parser, final List isActiveList, final GenomeLoc regionStartLoc) { + super(parser, isActiveList, regionStartLoc); + } + + @Override + protected ActivityProfile createDerivedProfile(List isActiveList) { + return new BandPassActivityProfile(parser, isActiveList, regionStartLoc); + } + + /** + * Band pass the probabilities in the ActivityProfile, producing a new profile that's band pass filtered + * @return a new double[] that's the band-pass filtered version of this profile + */ + @Override + public double[] finalizeProbabilities() { + final double[] activeProbArray = super.finalizeProbabilities(); + final double[] bandPassProbArray = new double[activeProbArray.length]; + + // apply the band pass filter for activeProbArray into filteredProbArray + for( int iii = 0; iii < activeProbArray.length; iii++ ) { + final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii)); + final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1)); + bandPassProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel); + } + + return bandPassProbArray; + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java index 76be54d72..f09a4b3e8 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java @@ -33,7 +33,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import java.util.*; @@ -80,10 +80,10 @@ class DummyActiveRegionWalker extends ActiveRegionWalker { } @Override - public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public ActivityProfileState isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { isActiveCalls.add(ref.getLocus()); final double p = activeRegions == null || activeRegions.overlaps(ref.getLocus()) ? prob : 0.0; - return new ActivityProfileResult(ref.getLocus(), p); + return new ActivityProfileState(ref.getLocus(), p); } @Override diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileResultTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileStateUnitTest.java similarity index 79% rename from public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileResultTest.java rename to public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileStateUnitTest.java index d131666fa..019cf82da 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileResultTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileStateUnitTest.java @@ -25,23 +25,17 @@ package org.broadinstitute.sting.utils.activeregion; -import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.File; import java.io.FileNotFoundException; import java.util.Arrays; -import java.util.EnumSet; import java.util.LinkedList; import java.util.List; @@ -52,7 +46,7 @@ import java.util.List; * Time: 2:30 PM * To change this template use File | Settings | File Templates. */ -public class ActivityProfileResultTest { +public class ActivityProfileStateUnitTest { private GenomeLocParser genomeLocParser; @BeforeClass @@ -71,7 +65,7 @@ public class ActivityProfileResultTest { genomeLocParser.createGenomeLoc(chr, 10, 10), genomeLocParser.createGenomeLoc(chr, 100, 100) )) { for ( final double prob : Arrays.asList(0.0, 0.5, 1.0) ) { - for ( final ActivityProfileResult.ActivityProfileResultState state : ActivityProfileResult.ActivityProfileResultState.values() ) { + for ( final ActivityProfileState.Type state : ActivityProfileState.Type.values() ) { for ( final Number value : Arrays.asList(1, 2, 4) ) { tests.add(new Object[]{ loc, prob, state, value}); } @@ -84,15 +78,15 @@ public class ActivityProfileResultTest { } @Test(dataProvider = "ActiveProfileResultProvider") - public void testActiveProfileResultProvider(GenomeLoc loc, final double prob, ActivityProfileResult.ActivityProfileResultState maybeState, final Number maybeNumber) { - final ActivityProfileResult apr = maybeState == null - ? new ActivityProfileResult(loc, prob) - : new ActivityProfileResult(loc, prob, maybeState, maybeNumber); + public void testActiveProfileResultProvider(GenomeLoc loc, final double prob, ActivityProfileState.Type maybeState, final Number maybeNumber) { + final ActivityProfileState apr = maybeState == null + ? new ActivityProfileState(loc, prob) + : new ActivityProfileState(loc, prob, maybeState, maybeNumber); Assert.assertEquals(apr.getLoc(), loc); Assert.assertNotNull(apr.toString()); Assert.assertEquals(apr.isActiveProb, prob); - Assert.assertEquals(apr.resultState, maybeState == null ? ActivityProfileResult.ActivityProfileResultState.NONE : maybeState); + Assert.assertEquals(apr.resultState, maybeState == null ? ActivityProfileState.Type.NONE : maybeState); Assert.assertEquals(apr.resultValue, maybeState == null ? null : maybeNumber); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java index ff27037d3..430e0b5c6 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java @@ -42,9 +42,7 @@ import org.testng.annotations.Test; import java.io.File; import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; +import java.util.*; public class ActivityProfileUnitTest extends BaseTest { @@ -70,23 +68,26 @@ public class ActivityProfileUnitTest extends BaseTest { List expectedRegions; int extension = 0; GenomeLoc regionStart = startLoc; + final ProfileType type; - public BasicActivityProfileTestProvider(final List probs, final List expectedRegions) { - super(BasicActivityProfileTestProvider.class); - this.probs = probs; - this.expectedRegions = expectedRegions; - setName(getName()); - } - - public BasicActivityProfileTestProvider(final List probs, boolean startActive, int ... startsAndStops) { + public BasicActivityProfileTestProvider(final ProfileType type, final List probs, boolean startActive, int ... startsAndStops) { super(BasicActivityProfileTestProvider.class); + this.type = type; this.probs = probs; this.expectedRegions = toRegions(startActive, startsAndStops); setName(getName()); } private String getName() { - return String.format("probs=%s expectedRegions=%s", Utils.join(",", probs), Utils.join(",", expectedRegions)); + return String.format("type=%s probs=%s expectedRegions=%s", type, Utils.join(",", probs), Utils.join(",", expectedRegions)); + } + + public ActivityProfile makeProfile() { + switch ( type ) { + case Base: return new ActivityProfile(genomeLocParser); + case BandPass: return new BandPassActivityProfile(genomeLocParser); + default: throw new IllegalStateException(type.toString()); + } } private List toRegions(boolean isActive, int[] startsAndStops) { @@ -103,27 +104,36 @@ public class ActivityProfileUnitTest extends BaseTest { } } + private enum ProfileType { + Base, BandPass + } + @DataProvider(name = "BasicActivityProfileTestProvider") public Object[][] makeQualIntervalTestProvider() { - new BasicActivityProfileTestProvider(Arrays.asList(1.0), true, 0, 1); - new BasicActivityProfileTestProvider(Arrays.asList(1.0, 0.0), true, 0, 1, 2); - new BasicActivityProfileTestProvider(Arrays.asList(0.0, 1.0), false, 0, 1, 2); - new BasicActivityProfileTestProvider(Arrays.asList(1.0, 0.0, 1.0), true, 0, 1, 2, 3); - new BasicActivityProfileTestProvider(Arrays.asList(1.0, 1.0, 1.0), true, 0, 3); + for ( final ProfileType type : ProfileType.values() ) { + new BasicActivityProfileTestProvider(type, Arrays.asList(1.0), true, 0, 1); + new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0), true, 0, 1, 2); + new BasicActivityProfileTestProvider(type, Arrays.asList(0.0, 1.0), false, 0, 1, 2); + new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0, 1.0), true, 0, 1, 2, 3); + new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 1.0, 1.0), true, 0, 3); + } return BasicActivityProfileTestProvider.getTests(BasicActivityProfileTestProvider.class); } @Test(dataProvider = "BasicActivityProfileTestProvider") public void testBasicActivityProfile(BasicActivityProfileTestProvider cfg) { - ActivityProfile profile = new ActivityProfile(genomeLocParser, false); + ActivityProfile profile = cfg.makeProfile(); + + Assert.assertTrue(profile.isEmpty()); Assert.assertEquals(profile.parser, genomeLocParser); for ( int i = 0; i < cfg.probs.size(); i++ ) { double p = cfg.probs.get(i); GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i); - profile.add(new ActivityProfileResult(loc, p)); + profile.add(new ActivityProfileState(loc, p)); + Assert.assertFalse(profile.isEmpty()); } Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() )); @@ -131,6 +141,11 @@ public class ActivityProfileUnitTest extends BaseTest { assertProbsAreEqual(profile.isActiveList, cfg.probs); assertRegionsAreEqual(profile.createActiveRegions(0, 100), cfg.expectedRegions); + + Assert.assertEquals(profile.createDerivedProfile(profile.isActiveList).getClass(), profile.getClass()); + + final List empty = new LinkedList(); + Assert.assertEquals(profile.createDerivedProfile(empty).size(), 0); } private void assertRegionsAreEqual(List actual, List expected) { @@ -140,7 +155,7 @@ public class ActivityProfileUnitTest extends BaseTest { } } - private void assertProbsAreEqual(List actual, List expected) { + private void assertProbsAreEqual(List actual, List expected) { Assert.assertEquals(actual.size(), expected.size()); for ( int i = 0; i < actual.size(); i++ ) { Assert.assertEquals(actual.get(i).isActiveProb, expected.get(i));