diff --git a/licensing/private_license.txt b/licensing/private_license.txt deleted file mode 120000 index d83474e7a..000000000 --- a/licensing/private_license.txt +++ /dev/null @@ -1 +0,0 @@ -protected_license.txt \ No newline at end of file diff --git a/licensing/private_license.txt b/licensing/private_license.txt new file mode 100644 index 000000000..2f40c5089 --- /dev/null +++ b/licensing/private_license.txt @@ -0,0 +1,43 @@ + By downloading the PROGRAM you agree to the following terms of use: + + BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY + + This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). + + WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and + WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. + NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: + + 1. DEFINITIONS + 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. + + 2. LICENSE + 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. + The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. + 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. + 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. + + 3. OWNERSHIP OF INTELLECTUAL PROPERTY + LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. + Copyright 2012 Broad Institute, Inc. + Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. + LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. + + 4. INDEMNIFICATION + LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. + + 5. NO REPRESENTATIONS OR WARRANTIES + THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. + IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. + + 6. ASSIGNMENT + This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. + + 7. MISCELLANEOUS + 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. + 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. + 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. + 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. + 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. + 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. + 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java index 08de5a6aa..74ff77e4b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java @@ -57,7 +57,7 @@ import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; import org.broadinstitute.sting.gatk.walkers.PartitionBy; import org.broadinstitute.sting.gatk.walkers.PartitionType; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import java.io.PrintStream; @@ -74,12 +74,12 @@ public class FindCoveredIntervals extends ActiveRegionWalker { @Override // Look to see if the region has sufficient coverage - public ActivityProfileResult isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup()); // note the linear probability scale - return new ActivityProfileResult(ref.getLocus(), Math.min(depth / coverageThreshold, 1)); + return new ActivityProfileState(ref.getLocus(), Math.min(depth / coverageThreshold, 1)); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 26f2560b7..9bb04421c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -56,7 +56,6 @@ import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; @@ -71,7 +70,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.*; @@ -382,7 +381,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Override @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"}) - public ActivityProfileResult isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { + public ActivityProfileState isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) { @@ -391,15 +390,15 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } } if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) { - return new ActivityProfileResult(ref.getLocus(), 1.0); + return new ActivityProfileState(ref.getLocus(), 1.0); } } if( USE_ALLELES_TRIGGER ) { - return new ActivityProfileResult( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); + return new ActivityProfileState( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); } - if( context == null ) { return new ActivityProfileResult(ref.getLocus(), 0.0); } + if( context == null ) { return new ActivityProfileState(ref.getLocus(), 0.0); } final List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied noCall.add(Allele.NO_CALL); @@ -436,7 +435,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL); final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ); - return new ActivityProfileResult( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() ); + return new ActivityProfileState( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileState.Type.NONE, averageHQSoftClips.mean() ); } //--------------------------------------------------------------------------------------------------------------- diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 939b9873c..41f9ab680 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "b8f7b741445ce6b6ea491c794ce75c17"); + HCTest(CEUTRIO_BAM, "", "c142bc73447c72286ca48f4a4966d9b6"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15"); + HCTest(NA12878_BAM, "", "d172eb9447015ea50220c6947be145ea"); } @Test(enabled = false) @@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "c679ae7f04bdfda896b5c046d35e043c"); + "2c56ffc3b7fbbf154ae9ca355780a78f"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -96,13 +96,13 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "8730a9ebaeecae913dca2fb5a0d4e946"); + "66bd513d25b691a5b0c5084924b4a308"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "d590c8d6d5e58d685401b65a23846893"); + "d0fcbfa2ccce0ca4a2e81f31dc43d79d"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -113,7 +113,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6c0c441b71848c2eea38ab5e2afe1120"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "7e8a6ed62f866fc47c92af0e255ca180"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -135,7 +135,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "29f1125df5ab27cc937a144ae08ac735"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "602aabbbe710ac90b16e474c869e8a86"); } // That problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -146,14 +146,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("31db0a2d9eb07f86e0a89f0d97169072")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c23b589be3072027ff2da93067dbf549")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("add0f4f51969b7caeea99005a7ba1aa4")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("a612fe84dd7f80c4ad2d20d27fc6744e")); executeTest("HCTestStructuralIndels: ", spec); } @@ -175,7 +175,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("8a400b0c46f41447fcc35a907e34f384")); + Arrays.asList("0cb9132967fa9811e04f528be9f686dc")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("4e8121dd9dc90478f237bd6ae4d19920")); + Arrays.asList("36a90309dde1a325c274388e302ffaa5")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index a7e4d7649..ac1c751ca 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.traversals; +import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -32,19 +33,18 @@ import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfile; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.activeregion.*; import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.io.PrintStream; import java.util.*; /** @@ -69,8 +69,10 @@ import java.util.*; public class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { protected final static Logger logger = Logger.getLogger(TraversalEngine.class); protected final static boolean DEBUG = false; + protected final static boolean LOG_READ_CARRYING = false; // set by the tranversal + private boolean walkerHasPresetRegions = false; private int activeRegionExtension = -1; private int maxRegionSize = -1; @@ -78,6 +80,46 @@ public class TraverseActiveRegions extends TraversalEngine myReads = new LinkedList(); private GenomeLoc spanOfLastReadSeen = null; + private ActivityProfile activityProfile = null; + int maxReadsInMemory = 0; + ActiveRegionWalker walker; + + /** + * Have the debugging output streams been initialized already? + * + * We have to do lazy initialization because when the initialize() function is called + * the streams aren't yet initialized in the GATK walker. + */ + private boolean streamsInitialized = false; + + @Override + public void initialize(GenomeAnalysisEngine engine, Walker walker, ProgressMeter progressMeter) { + super.initialize(engine, walker, progressMeter); + + this.walker = (ActiveRegionWalker)walker; + if ( this.walker.wantsExtendedReads() && ! this.walker.wantsNonPrimaryReads() ) { + throw new IllegalArgumentException("Active region walker " + this.walker + " requested extended events but not " + + "non-primary reads, an inconsistent state. Please modify the walker"); + } + + activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); + maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); + walkerHasPresetRegions = this.walker.hasPresetActiveRegions(); + + activityProfile = new BandPassActivityProfile(engine.getGenomeLocParser()); + if ( walkerHasPresetRegions ) { + // we load all of the preset locations into the + for ( final GenomeLoc loc : this.walker.getPresetActiveRegions()) { + workQueue.add(new ActiveRegion(loc, null, true, engine.getGenomeLocParser(), getActiveRegionExtension())); + } + } + } + + // ------------------------------------------------------------------------------------- + // + // Utility functions + // + // ------------------------------------------------------------------------------------- protected int getActiveRegionExtension() { return activeRegionExtension; @@ -97,19 +139,6 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine activeRegions) { - if ( profile.isEmpty() ) - throw new IllegalStateException("trying to incorporate an empty active profile " + profile); - - final ActivityProfile bandPassFiltered = profile.bandPassFilter(); - activeRegions.addAll(bandPassFiltered.createActiveRegions( getActiveRegionExtension(), getMaxRegionSize() )); - return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() ); - } - - protected final ActivityProfileResult walkerActiveProb(final ActiveRegionWalker walker, - final RefMetaDataTracker tracker, final ReferenceContext refContext, - final AlignmentContext locus, final GenomeLoc location) { - if ( walker.hasPresetActiveRegions() ) { - return new ActivityProfileResult(location, walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0); - } else { - return walker.isActive( tracker, refContext, locus ); - } - } - protected ReferenceOrderedView getReferenceOrderedView(final ActiveRegionWalker walker, final LocusShardDataProvider dataProvider, final LocusView locusView) { @@ -157,19 +157,12 @@ public class TraverseActiveRegions extends TraversalEngine walker ) { - // Just want to output the active regions to a file, not actually process them - for( final ActiveRegion activeRegion : workQueue ) { - if( activeRegion.isActive ) { - walker.activeRegionOutStream.println( activeRegion.getLocation() ); - } - } - } + + // ------------------------------------------------------------------------------------- + // + // Actual traverse function + // + // ------------------------------------------------------------------------------------- /** * Did read appear in the last shard? @@ -189,7 +182,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine walker, final LocusShardDataProvider dataProvider, T sum) { - logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); + if ( LOG_READ_CARRYING || logger.isDebugEnabled() ) + logger.info(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); final LocusView locusView = new AllLocusView(dataProvider); - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - - final List activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); - - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + final ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); // We keep processing while the next reference location is within the interval final GenomeLoc locOfLastReadAtTraversalStart = spanOfLastSeenRead(); - // if we've moved onto a new contig, process all of the active regions - if ( onNewContig(dataProvider.getShard()) ) - sum = processActiveRegions(walker, sum, true); - - GenomeLoc prevLoc = null; while( locusView.hasNext() ) { final AlignmentContext locus = locusView.next(); final GenomeLoc location = locus.getLocation(); - // Grab all the previously unseen reads from this pileup and add them to the massive read list - // Note that this must occur before we leave because we are outside the intervals because - // reads may occur outside our intervals but overlap them in the future + // get all of the new reads that appear in the current pileup, and them to our list of reads + // provided we haven't seen them before final Collection reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); for( final GATKSAMRecord read : reads ) { - if ( appearedInLastShard(locOfLastReadAtTraversalStart, read) ) { - if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName()); - } else { + if ( ! appearedInLastShard(locOfLastReadAtTraversalStart, read) ) { if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider); rememberLastReadLocation(read); myReads.add(read); @@ -263,10 +228,11 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine walker, T sum) { - return processActiveRegions((ActiveRegionWalker)walker, sum, true); + return processActiveRegions((ActiveRegionWalker)walker, sum, true, true); } // ------------------------------------------------------------------------------------- @@ -360,8 +302,15 @@ public class TraverseActiveRegions extends TraversalEngine 0 ) + throw new IllegalStateException("Active region " + region + " on a contig after last seen read " + spanOfLastSeenRead()); + else { + return contigCmp < 0 || region.getExtendedLoc().getStop() < spanOfLastSeenRead().getStart(); + } } /** @@ -385,31 +334,153 @@ public class TraverseActiveRegions extends TraversalEngine 0" + }) + private void printIGVFormatHeader(final PrintStream out, final String graphType, final String ... columns ) { + out.printf("#track graphType=%s%n", graphType); + out.printf("Chromosome\tStart\tEnd\tFeature\t%s%n", Utils.join("\t", columns)); + + } + + /** + * Helper function to write out a IGV formatted line to out, at loc, with values + * + * http://www.broadinstitute.org/software/igv/IGV + * + * @param out a non-null PrintStream where we'll write our line + * @param loc the location of values + * @param featureName string name of this feature (see IGV format) + * @param values the floating point values to associate with loc and feature name in out + */ + @Requires({ + "out != null", + "loc != null", + "values.length > 0" + }) + private void printIGVFormatRow(final PrintStream out, final GenomeLoc loc, final String featureName, final double ... values) { + // note that start and stop are 0 based, but the stop is exclusive so we don't subtract 1 + out.printf("%s\t%d\t%d\t%s", loc.getContig(), loc.getStart() - 1, loc.getStop(), featureName); + for ( final double value : values ) + out.print(String.format("\t%.3f", value)); + out.println(); + } + + /** + * Write out activity profile information, if requested by the walker + * + * @param states the states in the current activity profile + */ + @Requires("states != null") + private void writeActivityProfile(final List states) { + if ( walker.activityProfileOutStream != null ) { + initializeOutputStreamsIfNecessary(); + for ( final ActivityProfileState state : states ) { + printIGVFormatRow(walker.activityProfileOutStream, state.getLoc(), "state", state.isActiveProb); + } + } + } + + /** + * Write out each active region to the walker activeRegionOutStream + * + * @param region the region we're currently operating on + */ + @Requires("region != null") + private void writeActiveRegion(final ActiveRegion region) { + if( walker.activeRegionOutStream != null ) { + initializeOutputStreamsIfNecessary(); + printIGVFormatRow(walker.activeRegionOutStream, region.getLocation().getStartLocation(), + "end-marker", 0.0); + printIGVFormatRow(walker.activeRegionOutStream, region.getLocation(), + "size=" + region.getLocation().size(), region.isActive ? 1.0 : -1.0); + } + } + + // ------------------------------------------------------------------------------------- // // Functions to process active regions that are ready for map / reduce calls // // ------------------------------------------------------------------------------------- - private T processActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { - if( walker.activeRegionOutStream != null ) { - writeActiveRegionsToStream(walker); - return sum; - } else { - return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive); + /** + * Invoke the walker isActive function, and incorporate its result into the activity profile + * + * @param walker the walker we're running + * @param tracker the ref meta data tracker to pass on to the isActive function of walker + * @param refContext the refContext to pass on to the isActive function of walker + * @param locus the AlignmentContext to pass on to the isActive function of walker + */ + private void addIsActiveResult(final ActiveRegionWalker walker, + final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext locus) { + // must be called, even if we won't use the result, to satisfy walker contract + final ActivityProfileState state = walker.isActive( tracker, refContext, locus ); + if ( ! walkerHasPresetRegions ) { + activityProfile.add(state); } } - private T callWalkerMapOnActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { + /** + * Take the individual isActive calls and integrate them into contiguous active regions and + * add these blocks of work to the work queue + * band-pass filter the list of isActive probabilities and turn into active regions + */ + private T processActiveRegions(final ActiveRegionWalker walker, T sum, final boolean flushActivityProfile, final boolean forceAllRegionsToBeActive) { + if ( ! walkerHasPresetRegions ) { + // We don't have preset regions, so we get our regions from the activity profile + final Collection activeRegions = activityProfile.popReadyActiveRegions(getActiveRegionExtension(), getMaxRegionSize(), flushActivityProfile); + workQueue.addAll(activeRegions); + if ( ! activeRegions.isEmpty() && logger.isDebugEnabled() ) logger.debug("Integrated " + activityProfile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + } + // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them - // TODO can implement parallel traversal here while( workQueue.peek() != null ) { final ActiveRegion activeRegion = workQueue.peek(); - if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) { + if ( forceAllRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) { if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + spanOfLastSeenRead()); + writeActivityProfile(activeRegion.getSupportingStates()); + writeActiveRegion(activeRegion); sum = processActiveRegion( workQueue.remove(), sum, walker ); } else { break; @@ -419,7 +490,7 @@ public class TraverseActiveRegions extends TraversalEngine walker) { + private T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker) { final Iterator liveReads = myReads.iterator(); while ( liveReads.hasNext() ) { boolean killed = false; @@ -445,6 +516,11 @@ public class TraverseActiveRegions extends TraversalEngine> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); + + if ( LOG_READ_CARRYING ) + logger.info(String.format("Processing region %20s span=%3d active?=%5b with %4d reads. Overall max reads carried is %s", + activeRegion.getLocation(), activeRegion.getLocation().size(), activeRegion.isActive, activeRegion.size(), maxReadsInMemory)); + final M x = walker.map(activeRegion, null); return walker.reduce( x, sum ); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index f937c2458..24e512a7b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -40,7 +40,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalSetRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; @@ -61,18 +61,32 @@ import java.util.*; @ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class}) @RemoveProgramRecords public abstract class ActiveRegionWalker extends Walker { + /** + * If provided, this walker will write out its activity profile (per bp probabilities of being active) + * to this file in the IGV formatted TAB deliminated output: + * + * http://www.broadinstitute.org/software/igv/IGV + * + * Intended to make debugging the activity profile calculations easier + */ + @Output(fullName="activityProfileOut", shortName="APO", doc="Output the raw activity profile results in IGV format", required = false) + public PrintStream activityProfileOutStream = null; - @Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false) + /** + * If provided, this walker will write out its active and inactive regions + * to this file in the IGV formatted TAB deliminated output: + * + * http://www.broadinstitute.org/software/igv/IGV + * + * Intended to make debugging the active region calculations easier + */ + @Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this IGV formatted file", required = false) public PrintStream activeRegionOutStream = null; @Input(fullName="activeRegionIn", shortName="AR", doc="Use this interval list file as the active regions to process", required = false) protected List> activeRegionBindings = null; - public GenomeLocSortedSet presetActiveRegions = null; - - public boolean hasPresetActiveRegions() { - return presetActiveRegions != null; - } + private GenomeLocSortedSet presetActiveRegions = null; @Override public void initialize() { @@ -91,6 +105,22 @@ public abstract class ActiveRegionWalker extends Walker extends Walker= 0.0", "result.isActiveProb <= 1.0"}) - public abstract ActivityProfileResult isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context); + public abstract ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context); // Map over the ActiveRegion public abstract MapType map(final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker); diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 7462416bc..f1f0ab9b1 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -634,6 +634,30 @@ public class MathUtils { return normalizeFromLog10(array, false); } + /** + * normalizes the real-space probability array. + * + * Does not assume anything about the values in the array, beyond that no elements are below 0. It's ok + * to have values in the array of > 1, or have the sum go above 0. + * + * @param array the array to be normalized + * @return a newly allocated array corresponding the normalized values in array + */ + @Requires("array != null") + @Ensures({"result != null"}) + public static double[] normalizeFromRealSpace(final double[] array) { + if ( array.length == 0 ) + return array; + + final double sum = sum(array); + final double[] normalized = new double[array.length]; + if ( sum < 0.0 || sum > 1.0 ) throw new IllegalArgumentException("Values in probability array sum to a negative number " + sum); + for ( int i = 0; i < array.length; i++ ) { + normalized[i] = array[i] / sum; + } + return normalized; + } + public static int maxElementIndex(final double[] array) { return maxElementIndex(array, array.length); } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 809d4867e..66485c8cf 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.ArrayList; +import java.util.List; /** * Created by IntelliJ IDEA. @@ -44,6 +45,7 @@ import java.util.ArrayList; public class ActiveRegion implements HasGenomeLocation { private final ArrayList reads = new ArrayList(); + private final List supportingStates; private final GenomeLoc activeRegionLoc; private final GenomeLoc extendedLoc; private final int extension; @@ -51,8 +53,9 @@ public class ActiveRegion implements HasGenomeLocation { private final GenomeLocParser genomeLocParser; public final boolean isActive; - public ActiveRegion( final GenomeLoc activeRegionLoc, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) { + public ActiveRegion( final GenomeLoc activeRegionLoc, final List supportingStates, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) { this.activeRegionLoc = activeRegionLoc; + this.supportingStates = new ArrayList(supportingStates); this.isActive = isActive; this.genomeLocParser = genomeLocParser; this.extension = extension; @@ -62,7 +65,7 @@ public class ActiveRegion implements HasGenomeLocation { @Override public String toString() { - return "ActiveRegion " + activeRegionLoc.toString(); + return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive + " nReads=" + reads.size() + " "; } // add each read to the bin and extend the reference genome activeRegionLoc if needed @@ -112,6 +115,8 @@ public class ActiveRegion implements HasGenomeLocation { public GenomeLoc getExtendedLoc() { return extendedLoc; } public GenomeLoc getReferenceLoc() { return fullExtentReferenceLoc; } + public List getSupportingStates() { return supportingStates; } + public int getExtension() { return extension; } public int size() { return reads.size(); } public void clearReads() { reads.clear(); } @@ -126,19 +131,4 @@ public class ActiveRegion implements HasGenomeLocation { if ( extendedLoc.compareTo(other.extendedLoc) != 0 ) return false; return true; } - - /** - * A comparator class which is used to sort ActiveRegions by their start location - */ - /* - public static class ActiveRegionStartLocationComparator implements Comparator { - - public ActiveRegionStartLocationComparator() {} - - @Override - public int compare(final ActiveRegion left, final ActiveRegion right) { - return left.getLocation().compareTo(right.getLocation()); - } - } - */ } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java index d9b458f51..5da88cb6d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java @@ -26,12 +26,11 @@ package org.broadinstitute.sting.utils.activeregion; /** - * Created with IntelliJ IDEA. + * Describes how a read relates to an assigned ActiveRegion + * * User: thibault * Date: 11/26/12 * Time: 2:35 PM - * - * Describes how a read relates to an assigned ActiveRegion */ public enum ActiveRegionReadState { PRIMARY, // This is the read's primary region diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 909d99424..ab9095106 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -1,39 +1,36 @@ /* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ package org.broadinstitute.sting.utils.activeregion; +import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.MathUtils; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; +import java.util.*; /** * Class holding information about per-base activity scores for the @@ -43,34 +40,24 @@ import java.util.List; * @since Date created */ public class ActivityProfile { - final GenomeLocParser parser; - final boolean presetRegions; - GenomeLoc regionStartLoc = null; - GenomeLoc regionStopLoc = null; - final List isActiveList; - private static final int FILTER_SIZE = 80; - private static final double[] GaussianKernel; + private final static int MAX_PROB_PROPOGATION_DISTANCE = 10; + private final static double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author - static { - GaussianKernel = new double[2*FILTER_SIZE + 1]; - for( int iii = 0; iii < 2*FILTER_SIZE + 1; iii++ ) { - GaussianKernel[iii] = MathUtils.NormalDistribution(FILTER_SIZE, 55.0, iii); - } - } + protected final List stateList; + protected final GenomeLocParser parser; - // todo -- add upfront the start and stop of the intervals - // todo -- check that no regions are unexpectedly missing - // todo -- add unit tests - // TODO -- own preset regions - public ActivityProfile(final GenomeLocParser parser, final boolean presetRegions) { - this(parser, presetRegions, new ArrayList(), null); - } + protected GenomeLoc regionStartLoc = null; + protected GenomeLoc regionStopLoc = null; + + /** + * Create a new empty ActivityProfile + * @param parser the parser we can use to create genome locs, cannot be null + */ + public ActivityProfile(final GenomeLocParser parser) { + if ( parser == null ) throw new IllegalArgumentException("parser cannot be null"); - protected ActivityProfile(final GenomeLocParser parser, final boolean presetRegions, final List isActiveList, final GenomeLoc regionStartLoc) { this.parser = parser; - this.presetRegions = presetRegions; - this.isActiveList = isActiveList; - this.regionStartLoc = regionStartLoc; + this.stateList = new ArrayList(); } @Override @@ -82,149 +69,316 @@ public class ActivityProfile { } /** - * Add the next ActivityProfileResult to this profile. + * How far away can probability mass be moved around in this profile? + * + * This distance puts an upper limit on how far, in bp, we will ever propogate probability max around + * when adding a new ActivityProfileState. For example, if the value of this function is + * 10, and you are looking at a state at bp 5, and we know that no states beyond 5 + 10 will have + * their probability propograted back to that state. + * + * @return a positive integer distance in bp + */ + @Ensures("result >= 0") + public int getMaxProbPropagationDistance() { + return MAX_PROB_PROPOGATION_DISTANCE; + } + + /** + * How many profile results are in this profile? + * @return the number of profile results + */ + @Ensures("result >= 0") + public int size() { + return stateList.size(); + } + + /** + * Is this profile empty? + * @return true if the profile is empty + */ + @Ensures("isEmpty() == (size() == 0)") + public boolean isEmpty() { + return stateList.isEmpty(); + } + + /** + * Get the span of this activity profile, which is from the start of the first state to the stop of the last + * @return a potentially null GenomeLoc. Will be null if this profile is empty + */ + public GenomeLoc getSpan() { + return isEmpty() ? null : regionStartLoc.endpointSpan(regionStopLoc); + } + + @Requires("! isEmpty()") + public int getContigIndex() { + return regionStartLoc.getContigIndex(); + } + + @Requires("! isEmpty()") + public int getStop() { + return regionStopLoc.getStop(); + } + + /** + * Get the list of active profile results in this object + * @return a non-null, ordered list of active profile results + */ + @Ensures("result != null") + protected List getStateList() { + return stateList; + } + + /** + * Get the probabilities of the states as a single linear array of doubles + * @return a non-null array + */ + @Ensures("result != null") + protected double[] getProbabilitiesAsArray() { + final double[] probs = new double[getStateList().size()]; + int i = 0; + for ( final ActivityProfileState state : getStateList() ) + probs[i++] = state.isActiveProb; + return probs; + } + + /** + * Helper function that gets the genome loc for a site offset from relativeLoc, protecting ourselves from + * falling off the edge of the contig. + * + * @param relativeLoc the location offset is relative to + * @param offset the offset from relativeLoc where we'd like to create a GenomeLoc + * @return a genome loc with relativeLoc.start + offset, if this is on the contig, null otherwise + */ + @Requires("relativeLoc != null") + protected GenomeLoc getLocForOffset(final GenomeLoc relativeLoc, final int offset) { + final int start = relativeLoc.getStart() + offset; + if ( start < 0 || start > getCurrentContigLength() ) { + return null; + } else { + return parser.createGenomeLoc(regionStartLoc.getContig(), start); + } + } + + /** + * Get the length of the current contig + * @return the length in bp + */ + @Requires("regionStartLoc != null") + @Ensures("result > 0") + private int getCurrentContigLength() { + // TODO -- fix performance problem with getContigInfo + return parser.getContigInfo(regionStartLoc.getContig()).getSequenceLength(); + } + + // -------------------------------------------------------------------------------- + // + // routines to add states to a profile + // + // -------------------------------------------------------------------------------- + + /** + * Add the next ActivityProfileState to this profile. * * Must be contiguous with the previously added result, or an IllegalArgumentException will be thrown * - * @param result a well-formed ActivityProfileResult result to incorporate into this profile + * @param state a well-formed ActivityProfileState result to incorporate into this profile */ - @Requires("result != null") - public void add(final ActivityProfileResult result) { - final GenomeLoc loc = result.getLoc(); + @Requires("state != null") + public void add(final ActivityProfileState state) { + final GenomeLoc loc = state.getLoc(); if ( regionStartLoc == null ) { regionStartLoc = loc; regionStopLoc = loc; } else { + // TODO -- need to figure out where to add loc as the regions will be popping off the front if ( regionStopLoc.getStart() != loc.getStart() - 1 ) throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediate after last loc " + regionStopLoc ); regionStopLoc = loc; } - isActiveList.add(result); - } - - public int size() { - return isActiveList.size(); - } - - public boolean isEmpty() { - return isActiveList.isEmpty(); - } - - public boolean hasPresetRegions() { - return presetRegions; + final Collection processedStates = processState(state); + for ( final ActivityProfileState processedState : processedStates ) { + incorporateSingleState(processedState); + } } /** - * Band pass this ActivityProfile, producing a new profile that's band pass filtered - * @return a new ActivityProfile that's the band-pass filtered version of this profile + * Incorporate a single activity profile state into the current list of states + * + * If state's position occurs immediately after the last position in this profile, then + * the state is appended to the state list. If it's within the existing states list, + * the prob of stateToAdd is added to its corresponding state in the list. If the + * position would be before the start of this profile, stateToAdd is simply ignored. + * + * @param stateToAdd the state we want to add to the states list */ - public ActivityProfile bandPassFilter() { - final double[] activeProbArray = new double[isActiveList.size()]; - int iii = 0; - for( final ActivityProfileResult result : isActiveList ) { - activeProbArray[iii++] = result.isActiveProb; - } - iii = 0; - for( final ActivityProfileResult result : isActiveList ) { - if( result.resultState.equals(ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS) ) { // special code to deal with the problem that high quality soft clipped bases aren't added to pileups - final int numHQClips = result.resultValue.intValue(); - for( int jjj = Math.max(0, iii - numHQClips); jjj < Math.min(activeProbArray.length, iii+numHQClips); jjj++ ) { - activeProbArray[jjj] = Math.max(activeProbArray[jjj], activeProbArray[iii]); - } - } - iii++; - } + @Requires("stateToAdd != null") + private void incorporateSingleState(final ActivityProfileState stateToAdd) { + final int position = stateToAdd.getOffset(regionStartLoc); - final double[] filteredProbArray; - if( !presetRegions ) { - // if we aren't using preset regions, actually apply the band pass filter for activeProbArray into filteredProbArray - filteredProbArray = new double[activeProbArray.length]; - for( iii = 0; iii < activeProbArray.length; iii++ ) { - final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii)); - final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1)); - filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel); + if ( position > size() ) + // should we allow this? probably not + throw new IllegalArgumentException("Must add state contiguous to existing states"); + + if ( position >= 0 ) { + // ignore states starting before this regions start + if ( position < size() ) { + stateList.get(position).isActiveProb += stateToAdd.isActiveProb; + } else { + if ( position != size() ) throw new IllegalStateException("position == size but it wasn't"); + stateList.add(stateToAdd); } + } + } + + /** + * Process justAddedState, returning a collection of derived states that actually be added to the stateList + * + * The purpose of this function is to transform justAddedStates, if needed, into a series of atomic states + * that we actually want to track. For example, if state is for soft clips, we transform that single + * state into a list of states that surround the state up to the distance of the soft clip. + * + * Can be overridden by subclasses to transform states in any way + * + * There's no particular contract for the output states, except that they can never refer to states + * beyond the current end of the stateList unless the explictly include preceding states before + * the reference. So for example if the current state list is [1, 2, 3] this function could return + * [1,2,3,4,5] but not [1,2,3,5]. + * + * @param justAddedState the state our client provided to use to add to the list + * @return a list of derived states that should actually be added to this profile's state list + */ + protected Collection processState(final ActivityProfileState justAddedState) { + if ( justAddedState.resultState.equals(ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS) ) { + // special code to deal with the problem that high quality soft clipped bases aren't added to pileups + final List states = new LinkedList(); + final int numHQClips = justAddedState.resultValue.intValue(); + for( int jjj = - numHQClips; jjj <= numHQClips; jjj++ ) { + final GenomeLoc loc = getLocForOffset(justAddedState.getLoc(), jjj); + if ( loc != null ) + states.add(new ActivityProfileState(loc, justAddedState.isActiveProb)); + } + + return states; } else { - // otherwise we simply use the activeProbArray directly - filteredProbArray = activeProbArray; + return Collections.singletonList(justAddedState); } - - iii = 0; - for( final double prob : filteredProbArray ) { - final ActivityProfileResult result = isActiveList.get(iii++); - result.isActiveProb = prob; - result.resultState = ActivityProfileResult.ActivityProfileResultState.NONE; - result.resultValue = null; - } - - return new ActivityProfile(parser, presetRegions, isActiveList, regionStartLoc); } - /** - * Partition this profile into active regions - * @param activeRegionExtension the amount of margin overlap in the active region - * @return the list of active regions - */ - public List createActiveRegions( final int activeRegionExtension, final int maxRegionSize ) { - final double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author - final ArrayList returnList = new ArrayList(); + // -------------------------------------------------------------------------------- + // + // routines to get active regions from the profile + // + // -------------------------------------------------------------------------------- - if( isActiveList.size() == 0 ) { - // no elements in the active list, just return an empty one - return Collections.emptyList(); - } else if( isActiveList.size() == 1 ) { - // there's a single element, it's either active or inactive - boolean isActive = isActiveList.get(0).isActiveProb > ACTIVE_PROB_THRESHOLD; - returnList.addAll(createActiveRegion(isActive, 0, 0, activeRegionExtension, maxRegionSize)); - } else { - // there are 2+ elements, divide these up into regions - boolean isActive = isActiveList.get(0).isActiveProb > ACTIVE_PROB_THRESHOLD; - int curStart = 0; - for(int iii = 1; iii < isActiveList.size(); iii++ ) { - final boolean thisStatus = isActiveList.get(iii).isActiveProb > ACTIVE_PROB_THRESHOLD; - if( isActive != thisStatus ) { - returnList.addAll(createActiveRegion(isActive, curStart, iii - 1, activeRegionExtension, maxRegionSize)); - isActive = thisStatus; - curStart = iii; - } + /** + * Get the next completed active regions from this profile, and remove all states supporting them from this profile + * + * Takes the current profile and finds all of the active / inactive from the start of the profile that are + * ready. By ready we mean unable to have their probability modified any longer by future additions to the + * profile. The regions that are popped off the profile take their states with them, so the start of this + * profile will always be after the end of the last region returned here. + * + * The regions are returned sorted by genomic position. + * + * This function may not return anything in the list, if no regions are ready + * + * No returned region will be larger than maxRegionSize. + * + * @param activeRegionExtension the extension value to provide to the constructed regions + * @param maxRegionSize the maximize size of the returned region + * @param forceConversion if true, we'll return a region whose end isn't sufficiently far from the end of the + * stateList. Used to close out the active region when we've hit some kind of end (such + * as the end of the contig) + * @return a non-null list of active regions + */ + @Ensures("result != null") + public List popReadyActiveRegions(final int activeRegionExtension, final int maxRegionSize, final boolean forceConversion) { + if ( activeRegionExtension < 0 ) throw new IllegalArgumentException("activeRegionExtension must be >= 0 but got " + activeRegionExtension); + if ( maxRegionSize < 1 ) throw new IllegalArgumentException("maxRegionSize must be >= 1 but got " + maxRegionSize); + + final LinkedList regions = new LinkedList(); + + while ( true ) { + final ActiveRegion nextRegion = popNextReadyActiveRegion(activeRegionExtension, maxRegionSize, forceConversion); + if ( nextRegion == null ) + return regions; + else { + regions.add(nextRegion); } - returnList.addAll(createActiveRegion(isActive, curStart, isActiveList.size() - 1, activeRegionExtension, maxRegionSize)); // close out the current active region } - return returnList; } /** - * Helper routine to create an active region based on our current start and end offsets - * @param isActive should the region be active? - * @param curStart offset (0-based) from the start of this region - * @param curEnd offset (0-based) from the start of this region - * @param activeRegionExtension the amount of margin overlap in the active region - * @return a fully initialized ActiveRegion with the above properties + * Helper function for popReadyActiveRegions that pops the first ready region off the front of this profile + * + * If a region is returned, modifies the state of this profile so that states used to make the region are + * no longer part of the profile. Associated information (like the region start position) of this profile + * are also updated. + * + * @param activeRegionExtension the extension value to provide to the constructed regions + * @param maxRegionSize the maximize size of the returned region + * @param forceConversion if true, we'll return a region whose end isn't sufficiently far from the end of the + * stateList. Used to close out the active region when we've hit some kind of end (such + * as the end of the contig) + * @return a fully formed active region, or null if none can be made */ - private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) { - return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize, new ArrayList()); + private ActiveRegion popNextReadyActiveRegion(final int activeRegionExtension, final int maxRegionSize, final boolean forceConversion) { + if ( stateList.isEmpty() ) + return null; + + final ActivityProfileState first = stateList.get(0); + final boolean isActiveRegion = first.isActiveProb > ACTIVE_PROB_THRESHOLD; + final int offsetOfNextRegionEnd = findEndOfRegion(isActiveRegion, maxRegionSize, forceConversion); + if ( offsetOfNextRegionEnd == -1 ) + // couldn't find a valid ending offset, so we return null + return null; + + // we need to create the active region, and clip out the states we're extracting from this profile + final List sub = stateList.subList(0, offsetOfNextRegionEnd + 1); + final List supportingStates = new ArrayList(sub); + sub.clear(); + + // update the start and stop locations as necessary + if ( stateList.isEmpty() ) { + regionStartLoc = regionStopLoc = null; + } else { + regionStartLoc = stateList.get(0).getLoc(); + } + final GenomeLoc regionLoc = parser.createGenomeLoc(first.getLoc().getContig(), first.getLoc().getStart(), first.getLoc().getStart() + offsetOfNextRegionEnd); + return new ActiveRegion(regionLoc, supportingStates, isActiveRegion, parser, activeRegionExtension); } - private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List returnList) { - if( !isActive || curEnd - curStart < maxRegionSize ) { - final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd); - returnList.add(new ActiveRegion(loc, isActive, parser, activeRegionExtension)); - return returnList; + /** + * Find the end of the current region, returning the index into the element isActive element, or -1 if the region isn't done + * + * The current region is defined from the start of the stateList, looking for elements that have the same isActiveRegion + * flag (i.e., if isActiveRegion is true we are looking for states with isActiveProb > threshold, or alternatively + * for states < threshold). The maximize size of the returned region is maxRegionSize. If forceConversion is + * true, then we'll return the region end even if this isn't safely beyond the max prob propogation distance. + * + * @param isActiveRegion is the region we're looking for an active region or inactive region? + * @param maxRegionSize the maximize size of the returned region + * @param forceConversion if true, we'll return a region whose end isn't sufficiently far from the end of the + * stateList. Used to close out the active region when we've hit some kind of end (such + * as the end of the contig) + * @return the index into stateList of the last element of this region, or -1 if it cannot be found + */ + @Ensures({ + "result >= -1", + "result == -1 || result < maxRegionSize", + "! (result == -1 && forceConversion)"}) + private int findEndOfRegion(final boolean isActiveRegion, final int maxRegionSize, final boolean forceConversion) { + int i = 0; + while ( i < stateList.size() && i < maxRegionSize ) { + if ( stateList.get(i).isActiveProb > ACTIVE_PROB_THRESHOLD != isActiveRegion ) { + break; + } + i++; } - // find the best place to break up the large active region - Double minProb = Double.MAX_VALUE; - int cutPoint = -1; - final int size = curEnd - curStart + 1; - for( int iii = curStart + (int)(size*0.15); iii < curEnd - (int)(size*0.15); iii++ ) { - if( isActiveList.get(iii).isActiveProb < minProb ) { minProb = isActiveList.get(iii).isActiveProb; cutPoint = iii; } - } - final List leftList = createActiveRegion(isActive, curStart, cutPoint, activeRegionExtension, maxRegionSize, new ArrayList()); - final List rightList = createActiveRegion(isActive, cutPoint+1, curEnd, activeRegionExtension, maxRegionSize, new ArrayList()); - returnList.addAll( leftList ); - returnList.addAll( rightList ); - return returnList; + // we're one past the end, so i must be decremented + return forceConversion || i + getMaxProbPropagationDistance() < stateList.size() ? i - 1 : -1; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java similarity index 69% rename from public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java rename to public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java index bf2636465..272596be3 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileState.java @@ -30,44 +30,45 @@ import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.GenomeLoc; /** - * Created with IntelliJ IDEA. + * The state of an active region walker's isActive call at a specific locus in the genome + * * User: rpoplin * Date: 7/27/12 */ -public class ActivityProfileResult { - private GenomeLoc loc; +public class ActivityProfileState { + final private GenomeLoc loc; public double isActiveProb; - public ActivityProfileResultState resultState; + public Type resultState; public Number resultValue; - public enum ActivityProfileResultState { + public enum Type { NONE, HIGH_QUALITY_SOFT_CLIPS } /** - * Create a new ActivityProfileResult at loc with probability of being active of isActiveProb + * Create a new ActivityProfileState at loc with probability of being active of isActiveProb * * @param loc the position of the result profile (for debugging purposes) * @param isActiveProb the probability of being active (between 0 and 1) */ @Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"}) - public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb ) { - this(loc, isActiveProb, ActivityProfileResultState.NONE, null); + public ActivityProfileState(final GenomeLoc loc, final double isActiveProb) { + this(loc, isActiveProb, Type.NONE, null); } /** - * Create a new ActivityProfileResult at loc with probability of being active of isActiveProb that maintains some + * Create a new ActivityProfileState at loc with probability of being active of isActiveProb that maintains some * information about the result state and value (TODO RYAN -- what do these mean?) * * @param loc the position of the result profile (for debugging purposes) * @param isActiveProb the probability of being active (between 0 and 1) */ @Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"}) - public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb, final ActivityProfileResultState resultState, final Number resultValue ) { + public ActivityProfileState(final GenomeLoc loc, final double isActiveProb, final Type resultState, final Number resultValue) { // make sure the location of that activity profile is 1 if ( loc.size() != 1 ) - throw new IllegalArgumentException("Location for an ActivityProfileResult must have to size 1 bp but saw " + loc); + throw new IllegalArgumentException("Location for an ActivityProfileState must have to size 1 bp but saw " + loc); this.loc = loc; this.isActiveProb = isActiveProb; @@ -76,7 +77,17 @@ public class ActivityProfileResult { } /** - * Get the genome loc associated with the ActivityProfileResult + * The offset of state w.r.t. our current region's start location + * @param regionStartLoc the start of the region, as a genome loc + * @return the position of this profile relative to the start of this region + */ + public int getOffset(final GenomeLoc regionStartLoc) { + return getLoc().getStart() - regionStartLoc.getStart(); + } + + + /** + * Get the genome loc associated with the ActivityProfileState * @return the location of this result */ @Ensures("result != null") @@ -86,7 +97,7 @@ public class ActivityProfileResult { @Override public String toString() { - return "ActivityProfileResult{" + + return "ActivityProfileState{" + "loc=" + loc + ", isActiveProb=" + isActiveProb + ", resultState=" + resultState + diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java new file mode 100644 index 000000000..1a8bac086 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfile.java @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.activeregion; + +import com.google.java.contract.Ensures; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.Collection; +import java.util.LinkedList; + +/** + * A band pass filtering version of the activity profile + * + * Applies a band pass filter with a Gaussian kernel to the input state probabilities to smooth + * them out of an interval + * + * @author Mark DePristo + * @since 2011 + */ +public class BandPassActivityProfile extends ActivityProfile { + public static final int DEFAULT_FILTER_SIZE = 80; + + private final int filterSize; + private final double[] GaussianKernel; + + /** + * Create a band pass activity profile with the default band size + * @param parser our genome loc parser + */ + public BandPassActivityProfile(final GenomeLocParser parser) { + this(parser, DEFAULT_FILTER_SIZE); + } + + /** + * Create an activity profile that implements a band pass filter on the states + * @param parser our genome loc parser + * @param filterSize the size (in bp) of the band pass filter. The filter size is the number of bp to each + * side that are included in the band. So a filter size of 1 implies that the actual band + * is 3 bp, 1 for the center site and 1 on each size. 2 => 5, etc. + */ + public BandPassActivityProfile(final GenomeLocParser parser, final int filterSize) { + super(parser); + + if ( filterSize < 0 ) throw new IllegalArgumentException("Filter size must be greater than or equal to 0 but got " + filterSize); + + // setup the Gaussian kernel for the band pass filter + this.filterSize = filterSize; + final double[] kernel = new double[getBandSize()]; + for( int iii = 0; iii < 2* filterSize + 1; iii++ ) { + kernel[iii] = MathUtils.NormalDistribution(filterSize, 55.0, iii); + } + this.GaussianKernel = MathUtils.normalizeFromRealSpace(kernel); + } + + /** + * Our maximize propagation distance is whatever our parent's is, plus our filter size + * + * Stops the profile from interpreting sites that aren't yet fully determined due to + * propagation of the probabilities. + * + * @return the distance in bp we might move our probabilities around for some site i + */ + @Override + public int getMaxProbPropagationDistance() { + return super.getMaxProbPropagationDistance() + filterSize; + } + + /** + * Get the size (in bp) of the band pass filter + * @return a positive integer + */ + @Ensures("result >= 1") + public int getBandSize() { + return 2 * filterSize + 1; + } + + /** + * Get the filter size (which is the size of each wing of the band, minus the center point) + * @return a positive integer + */ + @Ensures("result >= 0") + public int getFilteredSize() { + return filterSize; + } + + /** + * Get the kernel of this band pass filter. Do not modify returned result + * @return the kernel used in this band pass filter + */ + @Ensures({"result != null", "result.length == getBandSize()"}) + protected double[] getKernel() { + return GaussianKernel; + } + + /** + * Band pass the probabilities in the ActivityProfile, producing a new profile that's band pass filtered + * @return a new double[] that's the band-pass filtered version of this profile + */ + @Override + protected Collection processState(final ActivityProfileState justAddedState) { + final Collection states = new LinkedList(); + + for ( final ActivityProfileState superState : super.processState(justAddedState) ) { + for( int jjj = -filterSize; jjj <= filterSize; jjj++ ) { + final GenomeLoc loc = getLocForOffset(justAddedState.getLoc(), jjj); + if ( loc != null ) { + final double newProb = superState.isActiveProb * GaussianKernel[jjj + filterSize]; + states.add(new ActivityProfileState(loc, newProb)); + } + } + } + + return states; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java index 82b5b29cc..bf3045c71 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java @@ -62,6 +62,7 @@ public class ArtificialBAMBuilder { int alignmentStart = 1; int readLength = 10; private final ArrayList samples = new ArrayList(); + private List createdReads = null; private LinkedList additionalReads = new LinkedList(); @@ -102,6 +103,7 @@ public class ArtificialBAMBuilder { } public ArtificialBAMBuilder createAndSetHeader(final int nSamples) { + createdReads = null; this.header = new SAMFileHeader(); header.setSortOrder(SAMFileHeader.SortOrder.coordinate); header.setSequenceDictionary(parser.getContigs()); @@ -120,10 +122,12 @@ public class ArtificialBAMBuilder { } public void addReads(final GATKSAMRecord readToAdd) { + createdReads = null; additionalReads.add(readToAdd); } public void addReads(final Collection readsToAdd) { + createdReads = null; additionalReads.addAll(readsToAdd); } @@ -140,26 +144,34 @@ public class ArtificialBAMBuilder { * @return a ordered list of reads */ public List makeReads() { - final String baseName = "read"; - List reads = new ArrayList(nReadsPerLocus*nLoci); - for ( int locusI = 0; locusI < nLoci; locusI++) { - final int locus = locusI * (skipNLoci + 1); - for ( int readI = 0; readI < nReadsPerLocus; readI++ ) { - for ( final SAMReadGroupRecord rg : header.getReadGroups() ) { - final String readName = String.format("%s.%d.%d.%s", baseName, locus, readI, rg.getId()); - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, readName, 0, alignmentStart + locus, readLength); - read.setReadGroup(new GATKSAMReadGroupRecord(rg)); - reads.add(read); + if ( createdReads == null ) { + final String baseName = "read"; + final LinkedList readGroups = new LinkedList(); + for ( final SAMReadGroupRecord rg : header.getReadGroups()) + readGroups.add(new GATKSAMReadGroupRecord(rg)); + + List reads = new ArrayList(nReadsPerLocus*nLoci); + for ( int locusI = 0; locusI < nLoci; locusI++) { + final int locus = locusI * (skipNLoci + 1); + for ( int readI = 0; readI < nReadsPerLocus; readI++ ) { + for ( final GATKSAMReadGroupRecord rg : readGroups ) { + final String readName = String.format("%s.%d.%d.%s", baseName, locus, readI, rg.getId()); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, readName, 0, alignmentStart + locus, readLength); + read.setReadGroup(rg); + reads.add(read); + } } } + + if ( ! additionalReads.isEmpty() ) { + reads.addAll(additionalReads); + Collections.sort(reads, new SAMRecordCoordinateComparator()); + } + + createdReads = new ArrayList(reads); } - if ( ! additionalReads.isEmpty() ) { - reads.addAll(additionalReads); - Collections.sort(reads, new SAMRecordCoordinateComparator()); - } - - return reads; + return createdReads; } /** @@ -192,13 +204,13 @@ public class ArtificialBAMBuilder { public int getnReadsPerLocus() { return nReadsPerLocus; } public int getnLoci() { return nLoci; } public int getSkipNLoci() { return skipNLoci; } - public ArtificialBAMBuilder setSkipNLoci(int skipNLoci) { this.skipNLoci = skipNLoci; return this; } + public ArtificialBAMBuilder setSkipNLoci(int skipNLoci) { this.skipNLoci = skipNLoci; createdReads = null; return this; } public int getAlignmentStart() { return alignmentStart; } - public ArtificialBAMBuilder setAlignmentStart(int alignmentStart) { this.alignmentStart = alignmentStart; return this; } + public ArtificialBAMBuilder setAlignmentStart(int alignmentStart) { this.alignmentStart = alignmentStart; createdReads = null; return this; } public int getReadLength() { return readLength; } - public ArtificialBAMBuilder setReadLength(int readLength) { this.readLength = readLength; return this; } + public ArtificialBAMBuilder setReadLength(int readLength) { this.readLength = readLength; createdReads = null; return this; } public SAMFileHeader getHeader() { return header; } - public ArtificialBAMBuilder setHeader(SAMFileHeader header) { this.header = header; return this; } + public ArtificialBAMBuilder setHeader(SAMFileHeader header) { this.header = header; createdReads = null; return this; } public int getAlignmentEnd() { return alignmentStart + nLoci * (skipNLoci + 1) + readLength; diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java index 76be54d72..e2cad88a1 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java @@ -33,7 +33,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import java.util.*; @@ -51,6 +51,7 @@ class DummyActiveRegionWalker extends ActiveRegionWalker { protected List isActiveCalls = new ArrayList(); protected Map mappedActiveRegions = new LinkedHashMap(); + private boolean declareHavingPresetRegions = false; public DummyActiveRegionWalker() { this(1.0); @@ -60,30 +61,41 @@ class DummyActiveRegionWalker extends ActiveRegionWalker { this.prob = constProb; } - public DummyActiveRegionWalker(EnumSet wantStates) { - this(1.0); + public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions, EnumSet wantStates, final boolean declareHavingPresetRegions) { + this(activeRegions, declareHavingPresetRegions); this.states = wantStates; } - public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions) { + public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions, final boolean declareHavingPresetRegions) { this(1.0); this.activeRegions = activeRegions; + this.declareHavingPresetRegions = declareHavingPresetRegions; } public void setStates(EnumSet states) { this.states = states; } + @Override + public boolean hasPresetActiveRegions() { + return declareHavingPresetRegions; + } + + @Override + public GenomeLocSortedSet getPresetActiveRegions() { + return declareHavingPresetRegions ? activeRegions : null; + } + @Override public EnumSet desiredReadStates() { return states; } @Override - public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public ActivityProfileState isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { isActiveCalls.add(ref.getLocus()); final double p = activeRegions == null || activeRegions.overlaps(ref.getLocus()) ? prob : 0.0; - return new ActivityProfileResult(ref.getLocus(), p); + return new ActivityProfileState(ref.getLocus(), p); } @Override diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 76eac3a8d..a574932a7 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -179,7 +179,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") public void testActiveRegionCoverage(TraverseActiveRegions t) { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), true); Collection activeRegions = getActiveRegions(t, walker, intervals).values(); verifyActiveRegionCoverage(intervals, activeRegions); @@ -242,9 +242,11 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } } - @Test(enabled = true, dataProvider = "TraversalEngineProvider") + @Test(enabled = true && !DEBUG, dataProvider = "TraversalEngineProvider") public void testPrimaryReadMapping(TraverseActiveRegions t) { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), + EnumSet.of(ActiveRegionReadState.PRIMARY), + true); // Contract: Each read has the Primary state in a single region (or none) // This is the region of maximum overlap for the read (earlier if tied) @@ -275,20 +277,18 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); verifyReadMapping(region); - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000)); verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); - verifyReadMapping(region); - region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); verifyReadMapping(region, "simple20"); } @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") public void testNonPrimaryReadMapping(TraverseActiveRegions t) { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker( - EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY)); + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY), + true); // Contract: Each read has the Primary state in a single region (or none) // This is the region of maximum overlap for the read (earlier if tied) @@ -321,10 +321,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); verifyReadMapping(region, "boundary_equal", "boundary_unequal", "boundary_1_pre", "boundary_1_post"); - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000)); verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); @@ -333,8 +330,9 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") public void testExtendedReadMapping(TraverseActiveRegions t) { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker( - EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED)); + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED), + true); // Contract: Each read has the Primary state in a single region (or none) // This is the region of maximum overlap for the read (earlier if tied) @@ -368,10 +366,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000)); verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); @@ -384,6 +379,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } private void verifyReadMapping(ActiveRegion region, String... reads) { + Assert.assertNotNull(region, "Region was unexpectedly null"); final Set regionReads = new HashSet(); for (SAMRecord read : region.getReads()) { Assert.assertFalse(regionReads.contains(read.getReadName()), "Duplicate reads detected in region " + region + " read " + read.getReadName()); @@ -530,12 +526,11 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { for ( final int start : starts ) { for ( final int nReadsPerLocus : Arrays.asList(1, 2) ) { for ( final int nLoci : Arrays.asList(1, 1000) ) { + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setSkipNLoci(skips); + bamBuilder.setAlignmentStart(start); for ( EnumSet readStates : allReadStates ) { - final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci); - bamBuilder.setReadLength(readLength); - bamBuilder.setSkipNLoci(skips); - bamBuilder.setAlignmentStart(start); - for ( final GenomeLocSortedSet activeRegions : enumerateActiveRegions(bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd())) { nTests++; if ( nTests < maxTests ) // && nTests == 1238 ) @@ -595,7 +590,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd()) ); - final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions); + final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions, false); walker.setStates(readStates); final TraverseActiveRegions traversal = new TraverseActiveRegions(); @@ -619,8 +614,9 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { alreadySeenReads.add(read.getReadName()); } - Assert.assertEquals(readNamesInRegion.contains(read.getReadName()), shouldBeInRegion, "Region " + region + - " failed contains read check: read " + read + " with span " + readLoc + " should be in region is " + shouldBeInRegion + " but I got the opposite"); + String msg = readNamesInRegion.contains(read.getReadName()) == shouldBeInRegion ? "" : "Region " + region + + " failed contains read check: read " + read + " with span " + readLoc + " should be in region is " + shouldBeInRegion + " but I got the opposite"; + Assert.assertEquals(readNamesInRegion.contains(read.getReadName()), shouldBeInRegion, msg); nReadsExpectedInRegion += shouldBeInRegion ? 1 : 0; } @@ -642,7 +638,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { // // --------------------------------------------------------------------------------------------------------- - @Test + @Test(enabled = true && ! DEBUG) public void ensureAllInsertionReadsAreInActiveRegions() { final int readLength = 10; @@ -667,7 +663,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd()) ); - final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions); + final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions, false); final TraverseActiveRegions traversal = new TraverseActiveRegions(); final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile().toString()); diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileStateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileStateUnitTest.java new file mode 100644 index 000000000..019cf82da --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileStateUnitTest.java @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.activeregion; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.FileNotFoundException; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 1/17/13 + * Time: 2:30 PM + * To change this template use File | Settings | File Templates. + */ +public class ActivityProfileStateUnitTest { + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void init() throws FileNotFoundException { + // sequence + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + } + + @DataProvider(name = "ActiveProfileResultProvider") + public Object[][] makeActiveProfileResultProvider() { + final List tests = new LinkedList(); + + final String chr = genomeLocParser.getContigs().getSequence(0).getSequenceName(); + for ( final GenomeLoc loc : Arrays.asList( + genomeLocParser.createGenomeLoc(chr, 10, 10), + genomeLocParser.createGenomeLoc(chr, 100, 100) )) { + for ( final double prob : Arrays.asList(0.0, 0.5, 1.0) ) { + for ( final ActivityProfileState.Type state : ActivityProfileState.Type.values() ) { + for ( final Number value : Arrays.asList(1, 2, 4) ) { + tests.add(new Object[]{ loc, prob, state, value}); + } + } + tests.add(new Object[]{ loc, prob, null, null}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ActiveProfileResultProvider") + public void testActiveProfileResultProvider(GenomeLoc loc, final double prob, ActivityProfileState.Type maybeState, final Number maybeNumber) { + final ActivityProfileState apr = maybeState == null + ? new ActivityProfileState(loc, prob) + : new ActivityProfileState(loc, prob, maybeState, maybeNumber); + + Assert.assertEquals(apr.getLoc(), loc); + Assert.assertNotNull(apr.toString()); + Assert.assertEquals(apr.isActiveProb, prob); + Assert.assertEquals(apr.resultState, maybeState == null ? ActivityProfileState.Type.NONE : maybeState); + Assert.assertEquals(apr.resultValue, maybeState == null ? null : maybeNumber); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java index ff27037d3..311d43206 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java @@ -1,27 +1,27 @@ /* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ package org.broadinstitute.sting.utils.activeregion; @@ -42,9 +42,7 @@ import org.testng.annotations.Test; import java.io.File; import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; +import java.util.*; public class ActivityProfileUnitTest extends BaseTest { @@ -70,23 +68,28 @@ public class ActivityProfileUnitTest extends BaseTest { List expectedRegions; int extension = 0; GenomeLoc regionStart = startLoc; + final ProfileType type; - public BasicActivityProfileTestProvider(final List probs, final List expectedRegions) { - super(BasicActivityProfileTestProvider.class); - this.probs = probs; - this.expectedRegions = expectedRegions; - setName(getName()); - } - - public BasicActivityProfileTestProvider(final List probs, boolean startActive, int ... startsAndStops) { + public BasicActivityProfileTestProvider(final ProfileType type, final List probs, boolean startActive, int ... startsAndStops) { super(BasicActivityProfileTestProvider.class); + this.type = type; this.probs = probs; this.expectedRegions = toRegions(startActive, startsAndStops); setName(getName()); } private String getName() { - return String.format("probs=%s expectedRegions=%s", Utils.join(",", probs), Utils.join(",", expectedRegions)); + return String.format("type=%s probs=%s expectedRegions=%s", type, Utils.join(",", probs), Utils.join(",", expectedRegions)); + } + + public ActivityProfile makeProfile() { + switch ( type ) { + case Base: return new ActivityProfile(genomeLocParser); + case BandPass: + // zero size => equivalent to ActivityProfile + return new BandPassActivityProfile(genomeLocParser, 0); + default: throw new IllegalStateException(type.toString()); + } } private List toRegions(boolean isActive, int[] startsAndStops) { @@ -95,7 +98,7 @@ public class ActivityProfileUnitTest extends BaseTest { int start = regionStart.getStart() + startsAndStops[i]; int end = regionStart.getStart() + startsAndStops[i+1] - 1; GenomeLoc activeLoc = genomeLocParser.createGenomeLoc(regionStart.getContig(), start, end); - ActiveRegion r = new ActiveRegion(activeLoc, isActive, genomeLocParser, extension); + ActiveRegion r = new ActiveRegion(activeLoc, null, isActive, genomeLocParser, extension); l.add(r); isActive = ! isActive; } @@ -103,34 +106,44 @@ public class ActivityProfileUnitTest extends BaseTest { } } + private enum ProfileType { + Base, BandPass + } + @DataProvider(name = "BasicActivityProfileTestProvider") public Object[][] makeQualIntervalTestProvider() { - new BasicActivityProfileTestProvider(Arrays.asList(1.0), true, 0, 1); - new BasicActivityProfileTestProvider(Arrays.asList(1.0, 0.0), true, 0, 1, 2); - new BasicActivityProfileTestProvider(Arrays.asList(0.0, 1.0), false, 0, 1, 2); - new BasicActivityProfileTestProvider(Arrays.asList(1.0, 0.0, 1.0), true, 0, 1, 2, 3); - new BasicActivityProfileTestProvider(Arrays.asList(1.0, 1.0, 1.0), true, 0, 3); + for ( final ProfileType type : ProfileType.values() ) { + new BasicActivityProfileTestProvider(type, Arrays.asList(1.0), true, 0, 1); + new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0), true, 0, 1, 2); + new BasicActivityProfileTestProvider(type, Arrays.asList(0.0, 1.0), false, 0, 1, 2); + new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 0.0, 1.0), true, 0, 1, 2, 3); + new BasicActivityProfileTestProvider(type, Arrays.asList(1.0, 1.0, 1.0), true, 0, 3); + } return BasicActivityProfileTestProvider.getTests(BasicActivityProfileTestProvider.class); } @Test(dataProvider = "BasicActivityProfileTestProvider") public void testBasicActivityProfile(BasicActivityProfileTestProvider cfg) { - ActivityProfile profile = new ActivityProfile(genomeLocParser, false); + ActivityProfile profile = cfg.makeProfile(); + + Assert.assertTrue(profile.isEmpty()); Assert.assertEquals(profile.parser, genomeLocParser); for ( int i = 0; i < cfg.probs.size(); i++ ) { double p = cfg.probs.get(i); GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i); - profile.add(new ActivityProfileResult(loc, p)); + profile.add(new ActivityProfileState(loc, p)); + Assert.assertFalse(profile.isEmpty(), "Profile shouldn't be empty after adding a state"); } - Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() )); + Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() ), "Start loc should be the start of the region"); - Assert.assertEquals(profile.size(), cfg.probs.size()); - assertProbsAreEqual(profile.isActiveList, cfg.probs); + Assert.assertEquals(profile.size(), cfg.probs.size(), "Should have exactly the number of states we expected to add"); + assertProbsAreEqual(profile.stateList, cfg.probs); - assertRegionsAreEqual(profile.createActiveRegions(0, 100), cfg.expectedRegions); + // TODO -- reanble tests + //assertRegionsAreEqual(profile.createActiveRegions(0, 100), cfg.expectedRegions); } private void assertRegionsAreEqual(List actual, List expected) { @@ -140,12 +153,193 @@ public class ActivityProfileUnitTest extends BaseTest { } } - private void assertProbsAreEqual(List actual, List expected) { + private void assertProbsAreEqual(List actual, List expected) { Assert.assertEquals(actual.size(), expected.size()); for ( int i = 0; i < actual.size(); i++ ) { Assert.assertEquals(actual.get(i).isActiveProb, expected.get(i)); } } - // todo -- test extensions + // ------------------------------------------------------------------------------------- + // + // Hardcore tests for adding to the profile and constructing active regions + // + // ------------------------------------------------------------------------------------- + + private static class SizeToStringList extends ArrayList { + @Override public String toString() { return "List[" + size() + "]"; } + } + + @DataProvider(name = "RegionCreationTests") + public Object[][] makeRegionCreationTests() { + final List tests = new LinkedList(); + + final int contigLength = genomeLocParser.getContigs().getSequences().get(0).getSequenceLength(); + for ( int start : Arrays.asList(1, 10, 100, contigLength - 100, contigLength - 10) ) { + for ( int regionSize : Arrays.asList(1, 10, 100, 1000, 10000) ) { + for ( int maxRegionSize : Arrays.asList(10, 50, 200) ) { + for ( final boolean waitUntilEnd : Arrays.asList(false, true) ) { + for ( final boolean forceConversion : Arrays.asList(false, true) ) { + // what do I really want to test here? I'd like to test a few cases: + // -- region is all active (1.0) + // -- region is all inactive (0.0) + // -- cut the interval into 1, 2, 3, 4, 5 ... 10 regions, each with alternating activity values + for ( final boolean startWithActive : Arrays.asList(true, false) ) { + for ( int nParts : Arrays.asList(1, 2, 3, 4, 5, 7, 10, 11, 13) ) { + +// for ( int start : Arrays.asList(1) ) { +// for ( int regionSize : Arrays.asList(100) ) { +// for ( int maxRegionSize : Arrays.asList(10) ) { +// for ( final boolean waitUntilEnd : Arrays.asList(true) ) { +// for ( final boolean forceConversion : Arrays.asList(false) ) { +// for ( final boolean startWithActive : Arrays.asList(true) ) { +// for ( int nParts : Arrays.asList(3) ) { + regionSize = Math.min(regionSize, contigLength - start); + final List regions = makeRegions(regionSize, startWithActive, nParts); + tests.add(new Object[]{ start, regions, maxRegionSize, nParts, forceConversion, waitUntilEnd }); + } + } + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private List makeRegions(final int totalRegionSize, + final boolean startWithActive, + final int nParts) { + final List regions = new SizeToStringList(); + + boolean isActive = startWithActive; + final int activeRegionSize = Math.max(totalRegionSize / nParts, 1); + for ( int i = 0; i < totalRegionSize; i += activeRegionSize ) { + for ( int j = 0; j < activeRegionSize && j + i < totalRegionSize; j++ ) { + regions.add(isActive); + } + isActive = ! isActive; + } + + return regions; + } + + + @Test(enabled = true, dataProvider = "RegionCreationTests") + public void testRegionCreation(final int start, final List probs, int maxRegionSize, final int nParts, final boolean forceConversion, final boolean waitUntilEnd) { + final ActivityProfile profile = new ActivityProfile(genomeLocParser); + Assert.assertNotNull(profile.toString()); + + final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName(); + final List seenSites = new ArrayList(Collections.nCopies(probs.size(), false)); + ActiveRegion lastRegion = null; + for ( int i = 0; i < probs.size(); i++ ) { + final boolean isActive = probs.get(i); + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, i + start); + final ActivityProfileState state = new ActivityProfileState(loc, isActive ? 1.0 : 0.0); + profile.add(state); + Assert.assertNotNull(profile.toString()); + + if ( ! waitUntilEnd ) { + final List regions = profile.popReadyActiveRegions(0, maxRegionSize, false); + lastRegion = assertGoodRegions(start, regions, maxRegionSize, lastRegion, probs, seenSites); + } + } + + if ( waitUntilEnd || forceConversion ) { + final List regions = profile.popReadyActiveRegions(0, maxRegionSize, forceConversion); + lastRegion = assertGoodRegions(start, regions, maxRegionSize, lastRegion, probs, seenSites); + } + + for ( int i = 0; i < probs.size(); i++ ) { + if ( forceConversion || (i + maxRegionSize + profile.getMaxProbPropagationDistance() < probs.size())) + // only require a site to be seen if we are forcing conversion or the site is more than maxRegionSize from the end + Assert.assertTrue(seenSites.get(i), "Missed site " + i); + } + + Assert.assertNotNull(profile.toString()); + } + + private ActiveRegion assertGoodRegions(final int start, final List regions, final int maxRegionSize, ActiveRegion lastRegion, final List probs, final List seenSites) { + for ( final ActiveRegion region : regions ) { + Assert.assertTrue(region.getLocation().size() > 0, "Region " + region + " has a bad size"); + Assert.assertTrue(region.getLocation().size() <= maxRegionSize, "Region " + region + " has a bad size: it's big than the max region size " + maxRegionSize); + if ( lastRegion != null ) { + Assert.assertTrue(region.getLocation().getStart() == lastRegion.getLocation().getStop() + 1, "Region " + region + " doesn't start immediately after previous region" + lastRegion); + } + + // check that all active bases are actually active + final int regionOffset = region.getLocation().getStart() - start; + Assert.assertTrue(regionOffset >= 0 && regionOffset < probs.size(), "Region " + region + " has a bad offset w.r.t. start"); + for ( int j = 0; j < region.getLocation().size(); j++ ) { + final int siteOffset = j + regionOffset; + Assert.assertEquals(region.isActive, probs.get(siteOffset).booleanValue()); + Assert.assertFalse(seenSites.get(siteOffset), "Site " + j + " in " + region + " was seen already"); + seenSites.set(siteOffset, true); + } + + lastRegion = region; + } + + return lastRegion; + } + + // ------------------------------------------------------------------------------------- + // + // Hardcore tests for adding to the profile and constructing active regions + // + // ------------------------------------------------------------------------------------- + + @DataProvider(name = "SoftClipsTest") + public Object[][] makeSoftClipsTest() { + final List tests = new LinkedList(); + + final int contigLength = genomeLocParser.getContigs().getSequences().get(0).getSequenceLength(); + for ( int start : Arrays.asList(1, 10, 100, contigLength - 100, contigLength - 10, contigLength - 1) ) { + for ( int precedingSites: Arrays.asList(0, 1, 10) ) { + if ( precedingSites + start < contigLength ) { + for ( int softClipSize : Arrays.asList(1, 2, 10, 100) ) { +// for ( int start : Arrays.asList(10) ) { +// for ( int precedingSites: Arrays.asList(10) ) { +// for ( int softClipSize : Arrays.asList(1) ) { + tests.add(new Object[]{ start, precedingSites, softClipSize }); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "SoftClipsTest") + public void testSoftClips(final int start, int nPrecedingSites, final int softClipSize) { + final ActivityProfile profile = new ActivityProfile(genomeLocParser); + + final int contigLength = genomeLocParser.getContigs().getSequences().get(0).getSequenceLength(); + final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName(); + for ( int i = 0; i < nPrecedingSites; i++ ) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, i + start); + final ActivityProfileState state = new ActivityProfileState(loc, 0.0); + profile.add(state); + } + + final GenomeLoc softClipLoc = genomeLocParser.createGenomeLoc(contig, nPrecedingSites + start); + profile.add(new ActivityProfileState(softClipLoc, 1.0, ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS, softClipSize)); + + if ( nPrecedingSites == 0 ) { + final int profileSize = Math.min(start + softClipSize, contigLength) - start + 1; + Assert.assertEquals(profile.size(), profileSize, "Wrong number of states in the profile"); + } + + for ( int i = 0; i < profile.size(); i++ ) { + final ActivityProfileState state = profile.getStateList().get(i); + final boolean withinSCRange = state.getLoc().distance(softClipLoc) <= softClipSize; + if ( withinSCRange ) { + Assert.assertTrue(state.isActiveProb > 0.0, "active prob should be changed within soft clip size"); + } else { + Assert.assertEquals(state.isActiveProb, 0.0, "active prob shouldn't be changed outside of clip size"); + } + } + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java new file mode 100644 index 000000000..a2a85f1d0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.activeregion; + + +// the imports for unit testing. + + +import net.sf.picard.reference.ReferenceSequenceFile; +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + + +public class BandPassActivityProfileUnitTest extends BaseTest { + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void init() throws FileNotFoundException { + // sequence + ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); + genomeLocParser = new GenomeLocParser(seq); + } + + @DataProvider(name = "BandPassBasicTest") + public Object[][] makeBandPassTest() { + final List tests = new LinkedList(); + + for ( int start : Arrays.asList(1, 10, 100, 1000) ) { + for ( boolean precedingIsActive : Arrays.asList(true, false) ) { + for ( int precedingSites: Arrays.asList(0, 1, 10, 100) ) { + for ( int bandPassSize : Arrays.asList(0, 1, 10, 100) ) { +// for ( int start : Arrays.asList(10) ) { +// for ( boolean precedingIsActive : Arrays.asList(false) ) { +// for ( int precedingSites: Arrays.asList(0) ) { +// for ( int bandPassSize : Arrays.asList(1) ) { + tests.add(new Object[]{ start, precedingIsActive, precedingSites, bandPassSize }); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "BandPassBasicTest") + public void testBandPass(final int start, final boolean precedingIsActive, final int nPrecedingSites, final int bandPassSize) { + final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, bandPassSize); + + final int expectedBandSize = bandPassSize * 2 + 1; + Assert.assertEquals(profile.getBandSize(), expectedBandSize, "Wrong expected band size"); + + final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName(); + final double precedingProb = precedingIsActive ? 1.0 : 0.0; + for ( int i = 0; i < nPrecedingSites; i++ ) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, i + start); + final ActivityProfileState state = new ActivityProfileState(loc, precedingProb); + profile.add(state); + } + + final GenomeLoc nextLoc = genomeLocParser.createGenomeLoc(contig, nPrecedingSites + start); + profile.add(new ActivityProfileState(nextLoc, 1.0)); + + if ( precedingIsActive == false && nPrecedingSites >= bandPassSize && bandPassSize < start ) { + // we have enough space that all probs fall on the genome + final double[] probs = profile.getProbabilitiesAsArray(); + Assert.assertEquals(MathUtils.sum(probs), 1.0 * (nPrecedingSites * precedingProb + 1), 1e-3, "Activity profile doesn't sum to number of non-zero prob states"); + } + } + + private double[] bandPassInOnePass(final BandPassActivityProfile profile, final double[] activeProbArray) { + final double[] bandPassProbArray = new double[activeProbArray.length]; + + // apply the band pass filter for activeProbArray into filteredProbArray + final double[] GaussianKernel = profile.getKernel(); + for( int iii = 0; iii < activeProbArray.length; iii++ ) { + final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(profile.getFilteredSize() - iii, 0), Math.min(GaussianKernel.length, profile.getFilteredSize() + activeProbArray.length - iii)); + final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - profile.getFilteredSize()), Math.min(activeProbArray.length,iii + profile.getFilteredSize() + 1)); + bandPassProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel); + } + + return bandPassProbArray; + } + + @DataProvider(name = "BandPassComposition") + public Object[][] makeBandPassComposition() { + final List tests = new LinkedList(); + + for ( int bandPassSize : Arrays.asList(0, 1, 10, 100, BandPassActivityProfile.DEFAULT_FILTER_SIZE) ) { + for ( int integrationLength : Arrays.asList(1, 10, 100, 1000) ) { + tests.add(new Object[]{ bandPassSize, integrationLength }); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test( dataProvider = "BandPassComposition") + public void testBandPassComposition(final int bandPassSize, final int integrationLength) { + final int start = 1; + final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, bandPassSize); + final double[] rawActiveProbs = new double[integrationLength + bandPassSize * 2]; + + // add a buffer so that we can get all of the band pass values + final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName(); + int pos = start; + int rawProbsOffset = 0; + for ( int i = 0; i < bandPassSize; i++ ) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos++); + final ActivityProfileState state = new ActivityProfileState(loc, 0.0); + profile.add(state); + rawActiveProbs[rawProbsOffset++] = 0.0; + rawActiveProbs[rawActiveProbs.length - rawProbsOffset] = 0.0; + } + + for ( int i = 0; i < integrationLength; i++ ) { + final GenomeLoc nextLoc = genomeLocParser.createGenomeLoc(contig, pos++); + profile.add(new ActivityProfileState(nextLoc, 1.0)); + rawActiveProbs[rawProbsOffset++] = 1.0; + + for ( int j = 0; j < profile.size(); j++ ) { + Assert.assertTrue(profile.getStateList().get(j).isActiveProb >= 0.0, "State probability < 0 at " + j); + Assert.assertTrue(profile.getStateList().get(j).isActiveProb <= 1.0 + 1e-3, "State probability > 1 at " + j); + } + } + + final double[] expectedProbs = bandPassInOnePass(profile, rawActiveProbs); + for ( int j = 0; j < profile.size(); j++ ) { + Assert.assertEquals(profile.getStateList().get(j).isActiveProb, expectedProbs[j], "State probability not expected at " + j); + } + } +}