From 1444cd753bfcdb0084afdadae3e7c45d257d8f91 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 30 Oct 2012 16:58:55 -0400 Subject: [PATCH] Bugfix for GSA-647 HaplotypeCaller misses good variant because the active region doesn't trigger for an exome -- The logic for determining active regions was a bit broken in the HC when intervals were used in the system -- TraverseActiveRegions now uses the AllLocus view, since we always want to see all reference sites, not just those covered. Simplifies logic of TAR -- Non-overlapping intervals are always treated as separate objects for determing active / inactive state. This means that each exon will stand on its own when deciding if it should be active or inactive -- Misc. cleanup, docs of some TAR infrastructure to make it safer and easier to debug in the future. -- Committing the SingleExomeCalling script that I used to find this problem, and will continue to use in evaluating calling of a single exome with the HC -- Make sure to get all of the reads into the set of potentially active reads, even for genomic locations that themselves don't overlap the engine intervals but may have reads that overlap the regions -- Remove excessively expensive calls to check bases are upper cased in ReferenceContext -- Update md5s after a lot of manual review and discussion with Ryan --- .../haplotypecaller/HaplotypeCaller.java | 16 +- .../HaplotypeCallerIntegrationTest.java | 6 +- .../sting/gatk/contexts/ReferenceContext.java | 10 +- .../traversals/TraverseActiveRegions.java | 199 +++++++++--------- .../targets/FindCoveredIntervals.java | 2 +- .../utils/activeregion/ActiveRegion.java | 20 +- .../utils/activeregion/ActivityProfile.java | 54 ++++- .../activeregion/ActivityProfileResult.java | 52 ++++- .../activeregion/ActivityProfileUnitTest.java | 2 +- 9 files changed, 226 insertions(+), 135 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 5aba23faa..a185ba6af 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; -import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -41,7 +40,10 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.genotyper.*; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.clipping.ReadClipper; @@ -212,7 +214,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem private VariantAnnotatorEngine annotationEngine; // fasta reference reader to supplement the edges of the reference sequence - private IndexedFastaSequenceFile referenceReader; + private CachingIndexedFastaSequenceFile referenceReader; // reference base padding size private static final int REFERENCE_PADDING = 900; @@ -324,15 +326,15 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } } if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) { - return new ActivityProfileResult(1.0); + return new ActivityProfileResult(ref.getLocus(), 1.0); } } if( USE_ALLELES_TRIGGER ) { - return new ActivityProfileResult( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); + return new ActivityProfileResult( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); } - if( context == null ) { return new ActivityProfileResult(0.0); } + if( context == null ) { return new ActivityProfileResult(ref.getLocus(), 0.0); } final List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied noCall.add(Allele.NO_CALL); @@ -369,7 +371,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL); final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ); - return new ActivityProfileResult( isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() ); + return new ActivityProfileResult( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() ); } //--------------------------------------------------------------------------------------------------------------- diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 86f3748ce..d00f5b61d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -53,7 +53,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "16013a9203367c3d1c4ce1dcdc81ef4a"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "d86fae2d1b504b422b7b0cfbbdecc2c4"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -69,8 +69,8 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { - final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c306140ad28515ee06c603c225217939")); + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("f6326adfdf5bc147626b30a89ce06d56")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java index c8bf1e3e8..34627b973 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java @@ -78,7 +78,7 @@ public class ReferenceContext { * * @return */ - @Ensures({"result != null", "BaseUtils.isUpperCase(result)"}) + @Ensures({"result != null"}) public byte[] getBases(); } @@ -143,6 +143,9 @@ public class ReferenceContext { private void fetchBasesFromProvider() { if ( basesCache == null ) { basesCache = basesProvider.getBases(); + + // must be an assertion that only runs when the bases are fetch to run in a reasonable amount of time + assert BaseUtils.isUpperCase(basesCache); } } @@ -172,7 +175,6 @@ public class ReferenceContext { * Get the base at the given locus. * @return The base at the given locus from the reference. */ - @Ensures("BaseUtils.isUpperCase(result)") public byte getBase() { return getBases()[(locus.getStart() - window.getStart())]; } @@ -182,7 +184,7 @@ public class ReferenceContext { * @return All bases available. If the window is of size [0,0], the array will * contain only the base at the given locus. */ - @Ensures({"result != null", "result.length > 0", "BaseUtils.isUpperCase(result)"}) + @Ensures({"result != null", "result.length > 0"}) public byte[] getBases() { fetchBasesFromProvider(); return basesCache; @@ -191,7 +193,7 @@ public class ReferenceContext { /** * All the bases in the window from the current base forward to the end of the window. */ - @Ensures({"result != null", "result.length > 0", "BaseUtils.isUpperCase(result)"}) + @Ensures({"result != null", "result.length > 0"}) public byte[] getForwardBases() { final byte[] bases = getBases(); final int mid = locus.getStart() - window.getStart(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 5d38df0f5..a2c37944a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -11,7 +11,6 @@ import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfile; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; @@ -46,99 +45,127 @@ public class TraverseActiveRegions extends TraversalEngine activeRegions = new LinkedList(); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); - // We keep processing while the next reference location is within the interval - GenomeLoc prevLoc = null; - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - GenomeLoc location = locus.getLocation(); + // We keep processing while the next reference location is within the interval + GenomeLoc prevLoc = null; + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); - if(prevLoc != null) { - // fill in the active / inactive labels from the stop of the previous location to the start of this location - // TODO refactor to separate function - for(int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++ ) { - final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii); - if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) { - profile.add(fakeLoc, new ActivityProfileResult( walker.hasPresetActiveRegions() && walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 )); - } - } + // Grab all the previously unseen reads from this pileup and add them to the massive read list + // Note that this must occur before we leave because we are outside the intervals because + // reads may occur outside our intervals but overlap them in the future + // TODO -- this whole HashSet logic should be changed to a linked list of reads with + // TODO -- subsequent pass over them to find the ones overlapping the active regions + for( final PileupElement p : locus.getBasePileup() ) { + final GATKSAMRecord read = p.getRead(); + if( !myReads.contains(read) ) { + myReads.add(read); } - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - if( initialIntervals == null || initialIntervals.overlaps( location ) ) { - profile.add(location, walkerActiveProb(walker, tracker, refContext, locus, location)); - } - - // Grab all the previously unseen reads from this pileup and add them to the massive read list - for( final PileupElement p : locus.getBasePileup() ) { - final GATKSAMRecord read = p.getRead(); - if( !myReads.contains(read) ) { - myReads.add(read); - } - - // If this is the last pileup for this shard calculate the minimum alignment start so that we know - // which active regions in the work queue are now safe to process - minStart = Math.min(minStart, read.getAlignmentStart()); - } - - prevLoc = location; - - printProgress(locus.getLocation()); + // If this is the last pileup for this shard calculate the minimum alignment start so that we know + // which active regions in the work queue are now safe to process + minStart = Math.min(minStart, read.getAlignmentStart()); } - updateCumulativeMetrics(dataProvider.getShard()); + // skip this location -- it's not part of our engine intervals + // TODO -- this is dangerously slow with current overlaps implementation : GSA-649 / GenomeLocSortedSet.overlaps is crazy slow + if ( outsideEngineIntervals(location) ) + continue; - // Take the individual isActive calls and integrate them into contiguous active regions and - // add these blocks of work to the work queue - // band-pass filter the list of isActive probabilities and turn into active regions - final ActivityProfile bandPassFiltered = profile.bandPassFilter(); - final List activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ); - - // add active regions to queue of regions to process - // first check if can merge active regions over shard boundaries - if( !activeRegions.isEmpty() ) { - if( !workQueue.isEmpty() ) { - final ActiveRegion last = workQueue.getLast(); - final ActiveRegion first = activeRegions.get(0); - if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { - workQueue.removeLast(); - activeRegions.remove(first); - workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); - } - } - workQueue.addAll( activeRegions ); + if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { + // we've move across some interval boundary, restart profile + profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); } - logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + dataProvider.getShard().getReadMetrics().incrementNumIterations(); - // now go and process all of the active regions - sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); + + prevLoc = location; + + printProgress(locus.getLocation()); } + updateCumulativeMetrics(dataProvider.getShard()); + + if ( ! profile.isEmpty() ) + incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); + + // add active regions to queue of regions to process + // first check if can merge active regions over shard boundaries + if( !activeRegions.isEmpty() ) { + if( !workQueue.isEmpty() ) { + final ActiveRegion last = workQueue.getLast(); + final ActiveRegion first = activeRegions.get(0); + if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { + workQueue.removeLast(); + activeRegions.remove(first); + workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); + } + } + workQueue.addAll( activeRegions ); + } + + logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + + // now go and process all of the active regions + sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); + return sum; } + /** + * Is the loc outside of the intervals being requested for processing by the GATK? + * @param loc + * @return + */ + private boolean outsideEngineIntervals(final GenomeLoc loc) { + return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc); + } + + /** + * Take the individual isActive calls and integrate them into contiguous active regions and + * add these blocks of work to the work queue + * band-pass filter the list of isActive probabilities and turn into active regions + * + * @param profile + * @param activeRegions + * @param activeRegionExtension + * @param maxRegionSize + * @return + */ + private ActivityProfile incorporateActiveRegions(final ActivityProfile profile, + final List activeRegions, + final int activeRegionExtension, + final int maxRegionSize) { + if ( profile.isEmpty() ) + throw new IllegalStateException("trying to incorporate an empty active profile " + profile); + + final ActivityProfile bandPassFiltered = profile.bandPassFilter(); + activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize )); + return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() ); + } + // -------------------------------------------------------------------------------- // @@ -150,7 +177,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine walker, final LocusShardDataProvider dataProvider ) { - final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); - if( dataSource == DataSource.READS ) - return new CoveredLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) - return new AllLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) - return new RodLocusView(dataProvider); - else - throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); - } - /** * Special function called in LinearMicroScheduler to empty out the work queue. * Ugly for now but will be cleaned up when we push this functionality more into the engine diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java index e17c6cdb7..85b7159e8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java @@ -57,7 +57,7 @@ public class FindCoveredIntervals extends ActiveRegionWalker { int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup()); // note the linear probability scale - return new ActivityProfileResult(Math.min(depth / coverageThreshold, 1)); + return new ActivityProfileResult(ref.getLocus(), Math.min(depth / coverageThreshold, 1)); } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index decc54d47..0d12d53cc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -1,11 +1,11 @@ package org.broadinstitute.sting.utils.activeregion; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.util.StringUtil; +import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.ArrayList; @@ -54,27 +54,31 @@ public class ActiveRegion implements HasGenomeLocation { public ArrayList getReads() { return reads; } - public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) { + @Requires("referenceReader.isUppercasingBases()") + public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader ) { return getActiveRegionReference(referenceReader, 0); } - public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + @Requires("referenceReader.isUppercasingBases()") + public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) { return getReference( referenceReader, padding, extendedLoc ); } - public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) { + @Requires("referenceReader.isUppercasingBases()") + public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader ) { return getFullReference(referenceReader, 0); } - public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + @Requires("referenceReader.isUppercasingBases()") + public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) { return getReference( referenceReader, padding, fullExtentReferenceLoc ); } - private byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { + @Requires("referenceReader.isUppercasingBases()") + private byte[] getReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(), Math.max(1, genomeLoc.getStart() - padding), Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases(); - StringUtil.toUpperCase(reference); return reference; } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 73f3cc487..e96eb843d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -24,11 +24,11 @@ package org.broadinstitute.sting.utils.activeregion; +import com.google.java.contract.Requires; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.ArrayList; import java.util.Collections; @@ -45,6 +45,7 @@ public class ActivityProfile { final GenomeLocParser parser; final boolean presetRegions; GenomeLoc regionStartLoc = null; + GenomeLoc regionStopLoc = null; final List isActiveList; private static final int FILTER_SIZE = 80; private static final double[] GaussianKernel; @@ -71,19 +72,49 @@ public class ActivityProfile { this.regionStartLoc = regionStartLoc; } - public void add(final GenomeLoc loc, final ActivityProfileResult result) { - if ( loc.size() != 1 ) - throw new ReviewedStingException("Bad add call to ActivityProfile: loc " + loc + " size != 1" ); - isActiveList.add(result); - if( regionStartLoc == null ) { + @Override + public String toString() { + return "ActivityProfile{" + + "start=" + regionStartLoc + + ", stop=" + regionStopLoc + + '}'; + } + + /** + * Add the next ActivityProfileResult to this profile. + * + * Must be contiguous with the previously added result, or an IllegalArgumentException will be thrown + * + * @param result a well-formed ActivityProfileResult result to incorporate into this profile + */ + @Requires("result != null") + public void add(final ActivityProfileResult result) { + final GenomeLoc loc = result.getLoc(); + + if ( regionStartLoc == null ) { regionStartLoc = loc; + regionStopLoc = loc; + } else { + if ( regionStopLoc.getStart() != loc.getStart() - 1 ) + throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediate after last loc " + regionStopLoc ); + regionStopLoc = loc; } + + isActiveList.add(result); } public int size() { return isActiveList.size(); } + public boolean isEmpty() { + return isActiveList.isEmpty(); + } + + public boolean hasPresetRegions() { + return presetRegions; + } + /** * Band pass this ActivityProfile, producing a new profile that's band pass filtered * @return a new ActivityProfile that's the band-pass filtered version of this profile @@ -104,14 +135,21 @@ public class ActivityProfile { } iii++; } - final double[] filteredProbArray = new double[activeProbArray.length]; + + final double[] filteredProbArray; if( !presetRegions ) { + // if we aren't using preset regions, actually apply the band pass filter for activeProbArray into filteredProbArray + filteredProbArray = new double[activeProbArray.length]; for( iii = 0; iii < activeProbArray.length; iii++ ) { final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii)); final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1)); filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel); } + } else { + // otherwise we simply use the activeProbArray directly + filteredProbArray = activeProbArray; } + iii = 0; for( final double prob : filteredProbArray ) { final ActivityProfileResult result = isActiveList.get(iii++); @@ -119,6 +157,7 @@ public class ActivityProfile { result.resultState = ActivityProfileResult.ActivityProfileResultState.NONE; result.resultValue = null; } + return new ActivityProfile(parser, presetRegions, isActiveList, regionStartLoc); } @@ -166,6 +205,7 @@ public class ActivityProfile { private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) { return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize, new ArrayList()); } + private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List returnList) { if( !isActive || curEnd - curStart < maxRegionSize ) { final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd); diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java index 8dc29aa3c..273c2e785 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java @@ -1,12 +1,16 @@ package org.broadinstitute.sting.utils.activeregion; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.GenomeLoc; + /** * Created with IntelliJ IDEA. * User: rpoplin * Date: 7/27/12 */ - public class ActivityProfileResult { + private GenomeLoc loc; public double isActiveProb; public ActivityProfileResultState resultState; public Number resultValue; @@ -16,16 +20,52 @@ public class ActivityProfileResult { HIGH_QUALITY_SOFT_CLIPS } - public ActivityProfileResult( final double isActiveProb ) { - this.isActiveProb = isActiveProb; - this.resultState = ActivityProfileResultState.NONE; - this.resultValue = null; + /** + * Create a new ActivityProfileResult at loc with probability of being active of isActiveProb + * + * @param loc the position of the result profile (for debugging purposes) + * @param isActiveProb the probability of being active (between 0 and 1) + */ + @Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"}) + public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb ) { + this(loc, isActiveProb, ActivityProfileResultState.NONE, null); } - public ActivityProfileResult( final double isActiveProb, final ActivityProfileResultState resultState, final Number resultValue ) { + /** + * Create a new ActivityProfileResult at loc with probability of being active of isActiveProb that maintains some + * information about the result state and value (TODO RYAN -- what do these mean?) + * + * @param loc the position of the result profile (for debugging purposes) + * @param isActiveProb the probability of being active (between 0 and 1) + */ + @Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"}) + public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb, final ActivityProfileResultState resultState, final Number resultValue ) { + // make sure the location of that activity profile is 1 + if ( loc.size() != 1 ) + throw new IllegalArgumentException("Location for an ActivityProfileResult must have to size 1 bp but saw " + loc); + + this.loc = loc; this.isActiveProb = isActiveProb; this.resultState = resultState; this.resultValue = resultValue; } + /** + * Get the genome loc associated with the ActivityProfileResult + * @return the location of this result + */ + @Ensures("result != null") + public GenomeLoc getLoc() { + return loc; + } + + @Override + public String toString() { + return "ActivityProfileResult{" + + "loc=" + loc + + ", isActiveProb=" + isActiveProb + + ", resultState=" + resultState + + ", resultValue=" + resultValue + + '}'; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java index f7c564c74..57dd19888 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java @@ -123,7 +123,7 @@ public class ActivityProfileUnitTest extends BaseTest { for ( int i = 0; i < cfg.probs.size(); i++ ) { double p = cfg.probs.get(i); GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i); - profile.add(loc, new ActivityProfileResult(p)); + profile.add(new ActivityProfileResult(loc, p)); } Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() ));