diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java index 3712a8e51..ac028d860 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java @@ -85,7 +85,7 @@ public class FindCoveredIntervals extends ActiveRegionWalker { @Override public GenomeLoc map(final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion, final RefMetaDataTracker tracker) { - if (activeRegion.isActive) + if (activeRegion.isActive()) return activeRegion.getLocation(); else return null; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 8b3eb9f1b..a27eaac8e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -463,15 +463,19 @@ public class HaplotypeCaller extends ActiveRegionWalker implem allelesToGenotype.removeAll( activeAllelesToGenotype ); } - if( !activeRegion.isActive ) { return 0; } // Not active so nothing to do! + if( !activeRegion.isActive()) { return 0; } // Not active so nothing to do! if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do! if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do! finalizeActiveRegion( activeRegion ); // merge overlapping fragments, clip adapter and low qual tails + + // note this operation must be performed before we clip the reads down, as this must correspond to the full reference region + final GenomeLoc fullSpanBeforeClipping = getPaddedLoc(activeRegion); + final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); // Create the reference haplotype which is the bases from the reference that make up the active region final byte[] fullReferenceWithPadding = activeRegion.getFullReference(referenceReader, REFERENCE_PADDING); //int PRUNE_FACTOR = Math.max(MIN_PRUNE_FACTOR, determinePruneFactorFromCoverage( activeRegion )); - final ArrayList haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, getPaddedLoc(activeRegion), MIN_PRUNE_FACTOR, activeAllelesToGenotype ); + final ArrayList haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, fullSpanBeforeClipping, MIN_PRUNE_FACTOR, activeAllelesToGenotype ); if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do! activeRegion.hardClipToActiveRegion(); // only evaluate the parts of reads that are overlapping the active region @@ -495,7 +499,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem stratifiedReadMap, perSampleFilteredReadList, fullReferenceWithPadding, - getPaddedLoc(activeRegion), + fullSpanBeforeClipping, activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) { @@ -505,9 +509,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if ( bamWriter != null ) { // write the haplotypes to the bam - final GenomeLoc paddedRefLoc = getPaddedLoc(activeRegion); for ( Haplotype haplotype : haplotypes ) - writeHaplotype(haplotype, paddedRefLoc, bestHaplotypes.contains(haplotype)); + writeHaplotype(haplotype, fullSpanBeforeClipping, bestHaplotypes.contains(haplotype)); // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently final Map alleleToHaplotypeMap = new HashMap(haplotypes.size()); @@ -519,7 +522,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem for ( Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); if ( bestAllele != Allele.NO_CALL ) - writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedRefLoc.getStart()); + writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), fullSpanBeforeClipping.getStart()); } } } @@ -559,7 +562,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem private void finalizeActiveRegion( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } final ArrayList finalizedReadList = new ArrayList(); - final FragmentCollection fragmentCollection = FragmentUtils.create( ReadUtils.sortReadsByCoordinate(activeRegion.getReads()) ); + final FragmentCollection fragmentCollection = FragmentUtils.create( activeRegion.getReads() ); activeRegion.clearReads(); // Join overlapping paired reads to create a single longer read @@ -571,17 +574,20 @@ public class HaplotypeCaller extends ActiveRegionWalker implem Collections.shuffle(finalizedReadList, GenomeAnalysisEngine.getRandomGenerator()); // Loop through the reads hard clipping the adaptor and low quality tails + final ArrayList readsToUse = new ArrayList(finalizedReadList.size()); for( final GATKSAMRecord myRead : finalizedReadList ) { final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) ); if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); // protect against INTERVALS with abnormally high coverage - // BUGBUG: remove when positional downsampler is hooked up to ART/HC - if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) { - activeRegion.add(clippedRead); + // TODO BUGBUG: remove when positional downsampler is hooked up to ART/HC + if( activeRegion.readOverlapsRegion(clippedRead) && + clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) { + readsToUse.add(clippedRead); } } } + activeRegion.addAll(ReadUtils.sortReadsByCoordinate(readsToUse)); } private List filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { @@ -596,9 +602,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } private GenomeLoc getPaddedLoc( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { - final int padLeft = Math.max(activeRegion.getReferenceLoc().getStart()-REFERENCE_PADDING, 1); - final int padRight = Math.min(activeRegion.getReferenceLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getReferenceLoc().getContig()).getSequenceLength()); - return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getReferenceLoc().getContig(), padLeft, padRight); + final int padLeft = Math.max(activeRegion.getReadSpanLoc().getStart()-REFERENCE_PADDING, 1); + final int padRight = Math.min(activeRegion.getReadSpanLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getReadSpanLoc().getContig()).getSequenceLength()); + return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getReadSpanLoc().getContig(), padLeft, padRight); } private HashMap> splitReadsBySample( final List reads ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 97b9ce746..d446da830 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "11290b619bc79b629cf29b8f428254ce"); + HCTest(CEUTRIO_BAM, "", "664a14590d0966e63d3aabff2d7bab0a"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "897abb2b4f98e9e460f373f9e0db5033"); + HCTest(NA12878_BAM, "", "111f3dc086a3cea1be9bd1ad6e1d18ed"); } @Test(enabled = false) @@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "efc2cae94069a1d6ee5fdcc7afeaa0ed"); + "c70f753f7918a1c670ce4ed5c66de09e"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -96,13 +96,13 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "01f42c311fc3ce4f07ef86f8c01facfb"); + "b1d3070f0c49becf34101e480ab6c589"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "4c117c84d1abeade1dee3f7b52a4a585"); + "20eba2e54266f6aebf35b7b7b7e754e3"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -113,7 +113,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "939847eb7bbafc798916acffdb1b5697"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "f9805488d85e59e1ae002d0d32d7011d"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -124,7 +124,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "25806874242973f00fb6f2a320ed4d9c"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "4544a2916f46f58b32db8008776b91a3"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -135,7 +135,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "c50b06d56cf3d0ef53e73a4973207949"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "f3984a91e7562494c2a7e41fd05a6734"); } // That problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -146,14 +146,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ae2470e294d99ff2b825281b84730c72")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("3e9e025c539be6c5e0d0f2e5d8e86a62")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("6f18ae64bf466476d780a083dcb5fc43")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("34129e6c6310ef4eeeeb59b0e7ac0464")); executeTest("HCTestStructuralIndels: ", spec); } @@ -175,7 +175,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("ecdb8e30ec5dd91efc179ab6732499f9")); + Arrays.asList("5f4c07aaf1d2d34cccce43196a5fbd71")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("36a90309dde1a325c274388e302ffaa5")); + Arrays.asList("6ead001b1f8e7cb433fd335f78fde5f0")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 6933b45a7..5d2aa6be3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -460,7 +460,7 @@ public class TraverseActiveRegions extends TraversalEngine> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); + logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive() ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReadSpanLoc()); } if ( LOG_READ_CARRYING ) logger.info(String.format("Processing region %20s span=%3d active?=%5b with %4d reads. Overall max reads carried is %s", - activeRegion.getLocation(), activeRegion.getLocation().size(), activeRegion.isActive, activeRegion.size(), maxReadsInMemory)); + activeRegion.getLocation(), activeRegion.getLocation().size(), activeRegion.isActive(), activeRegion.size(), maxReadsInMemory)); final M x = walker.map(activeRegion, null); return walker.reduce( x, sum ); diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 194f61933..13add5e7d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -25,37 +25,105 @@ package org.broadinstitute.sting.utils.activeregion; -import com.google.java.contract.Requires; +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.List; /** - * Created by IntelliJ IDEA. + * Represents a single active region created by the Active Region Traversal for processing + * + * An active region is a single contiguous span of bases on the genome that should be operated + * on as a single unit for the active region traversal. The action may contains a list of + * reads that overlap the region (may because there may be no reads in the region). The region + * is tagged as being either active or inactive, depending on the probabilities provided by + * the isActiveProb results from the ART walker. Each region carries with it the + * exact span of the region (bases which are the core of the isActiveProbs from the walker) as + * well as an extended size, that includes the ART walker's extension size. Reads in the region + * provided by ART include all reads overlapping the extended span, not the raw span. + * * User: rpoplin * Date: 1/4/12 */ - +@Invariant({ + "extension >= 0", + "activeRegionLoc != null", + "genomeLocParser != null", + "spanIncludingReads != null", + "extendedLoc != null" +}) public class ActiveRegion implements HasGenomeLocation { - + /** + * The reads included in this active region. May be empty upon creation, and expand / contract + * as reads are added or removed from this region. + */ private final ArrayList reads = new ArrayList(); - private final List supportingStates; - private final GenomeLoc activeRegionLoc; - private final GenomeLoc extendedLoc; - private final int extension; - private GenomeLoc fullExtentReferenceLoc = null; - private final GenomeLocParser genomeLocParser; - public final boolean isActive; + /** + * An ordered list (by genomic coordinate) of the ActivityProfileStates that went + * into this active region. May be empty, which says that no supporting states were + * provided when this region was created. + */ + private final List supportingStates; + + /** + * The raw span of this active region, not including the active region extension + */ + private final GenomeLoc activeRegionLoc; + + /** + * The span of this active region on the genome, including the active region extension + */ + private final GenomeLoc extendedLoc; + + /** + * The extension, in bp, of this active region. + */ + private final int extension; + + /** + * A genomeLocParser so we can create genomeLocs + */ + private final GenomeLocParser genomeLocParser; + + /** + * Does this region represent an active region (all isActiveProbs above threshold) or + * an inactive region (all isActiveProbs below threshold)? + */ + private final boolean isActive; + + /** + * The span of this active region, including the bp covered by all reads in this + * region. This union of extensionLoc and the loc of all reads in this region. + * + * Must be at least as large as extendedLoc, but may be larger when reads + * partially overlap this region. + */ + private GenomeLoc spanIncludingReads; + + /** + * Create a new ActiveRegion containing no reads + * + * @param activeRegionLoc the span of this active region + * @param supportingStates the states that went into creating this region, or null / empty if none are available. + * If not empty, must have exactly one state for each bp in activeRegionLoc + * @param isActive indicates whether this is an active region, or an inactve one + * @param genomeLocParser a non-null parser to let us create new genome locs + * @param extension the active region extension to use for this active region + */ public ActiveRegion( final GenomeLoc activeRegionLoc, final List supportingStates, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) { if ( activeRegionLoc == null ) throw new IllegalArgumentException("activeRegionLoc cannot be null"); + if ( activeRegionLoc.size() == 0 ) throw new IllegalArgumentException("Active region cannot be of zero size, but got " + activeRegionLoc); if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null"); if ( extension < 0 ) throw new IllegalArgumentException("extension cannot be < 0 but got " + extension); @@ -64,75 +132,254 @@ public class ActiveRegion implements HasGenomeLocation { this.isActive = isActive; this.genomeLocParser = genomeLocParser; this.extension = extension; - extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); - fullExtentReferenceLoc = extendedLoc; + this.extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); + this.spanIncludingReads = extendedLoc; + + if ( ! this.supportingStates.isEmpty() ) { + if ( this.supportingStates.size() != activeRegionLoc.size() ) + throw new IllegalArgumentException("Supporting states wasn't empty but it doesn't have exactly one state per bp in the active region: states " + this.supportingStates.size() + " vs. bp in region = " + activeRegionLoc.size()); + GenomeLoc lastStateLoc = null; + for ( final ActivityProfileState state : this.supportingStates ) { + if ( lastStateLoc != null ) { + if ( state.getLoc().getStart() != lastStateLoc.getStart() + 1 || state.getLoc().getContigIndex() != lastStateLoc.getContigIndex()) + throw new IllegalArgumentException("Supporting state has an invalid sequence: last state was " + lastStateLoc + " but next state was " + state); + } + lastStateLoc = state.getLoc(); + } + } } @Override public String toString() { - return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive + " nReads=" + reads.size() + " "; + return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive() + " nReads=" + reads.size() + " "; } - // add each read to the bin and extend the reference genome activeRegionLoc if needed - public void add( final GATKSAMRecord read ) { - fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) ); - reads.add( read ); - } - - public void hardClipToActiveRegion() { - final ArrayList clippedReads = ReadClipper.hardClipToRegion( reads, extendedLoc.getStart(), extendedLoc.getStop() ); - reads.clear(); - reads.addAll(clippedReads); - } - - public ArrayList getReads() { return reads; } - - @Requires("referenceReader.isUppercasingBases()") - public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader ) { + /** + * See #getActiveRegionReference but with padding == 0 + */ + public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) { return getActiveRegionReference(referenceReader, 0); } - @Requires("referenceReader.isUppercasingBases()") - public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) { - return getReference( referenceReader, padding, extendedLoc ); + /** + * Get the reference bases from referenceReader spanned by the extended location of this active region, + * including additional padding bp on either side. If this expanded region would exceed the boundaries + * of the active region's contig, the returned result will be truncated to only include on-genome reference + * bases + * @param referenceReader the source of the reference genome bases + * @param padding the padding, in BP, we want to add to either side of this active region extended region + * @return a non-null array of bytes holding the reference bases in referenceReader + */ + @Ensures("result != null") + public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + return getReference(referenceReader, padding, extendedLoc); } - @Requires("referenceReader.isUppercasingBases()") - public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader ) { + /** + * See #getActiveRegionReference but using the span including regions not the extended span + */ + public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) { return getFullReference(referenceReader, 0); } - @Requires("referenceReader.isUppercasingBases()") - public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) { - return getReference( referenceReader, padding, fullExtentReferenceLoc ); + /** + * See #getActiveRegionReference but using the span including regions not the extended span + */ + public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + return getReference(referenceReader, padding, spanIncludingReads); } - @Requires("referenceReader.isUppercasingBases()") - private byte[] getReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { + /** + * Get the reference bases from referenceReader spanned by the extended location of this active region, + * including additional padding bp on either side. If this expanded region would exceed the boundaries + * of the active region's contig, the returned result will be truncated to only include on-genome reference + * bases + * @param referenceReader the source of the reference genome bases + * @param padding the padding, in BP, we want to add to either side of this active region extended region + * @param genomeLoc a non-null genome loc indicating the base span of the bp we'd like to get the reference for + * @return a non-null array of bytes holding the reference bases in referenceReader + */ + @Ensures("result != null") + private byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { + if ( referenceReader == null ) throw new IllegalArgumentException("referenceReader cannot be null"); + if ( padding < 0 ) throw new IllegalArgumentException("padding must be a positive integer but got " + padding); + if ( genomeLoc == null ) throw new IllegalArgumentException("genomeLoc cannot be null"); + if ( genomeLoc.size() == 0 ) throw new IllegalArgumentException("GenomeLoc must have size > 0 but got " + genomeLoc); + final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(), Math.max(1, genomeLoc.getStart() - padding), Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases(); + return reference; } + /** + * Get the raw span of this active region (excluding the extension) + * @return a non-null genome loc + */ @Override + @Ensures("result != null") public GenomeLoc getLocation() { return activeRegionLoc; } + + /** + * Get the span of this active region including the extension value + * @return a non-null GenomeLoc + */ + @Ensures("result != null") public GenomeLoc getExtendedLoc() { return extendedLoc; } - public GenomeLoc getReferenceLoc() { return fullExtentReferenceLoc; } - public List getSupportingStates() { return supportingStates; } + /** + * Get the span of this active region including the extension and the projects on the + * genome of all reads in this active region. That is, returns the bp covered by this + * region and all reads in the region. + * @return a non-null genome loc + */ + @Ensures("result != null") + public GenomeLoc getReadSpanLoc() { return spanIncludingReads; } + /** + * Get the active profile states that went into creating this region, if possible + * @return an unmodifiable list of states that led to the creation of this region, or an empty + * list if none were provided + */ + @Ensures("result != null") + public List getSupportingStates() { + return Collections.unmodifiableList(supportingStates); + } + + /** + * Get the active region extension applied to this region + * + * The extension is >= 0 bp in size, and indicates how much padding this art walker wanted for its regions + * + * @return the size in bp of the region extension + */ + @Ensures("result >= 0") public int getExtension() { return extension; } - public int size() { return reads.size(); } - public void clearReads() { reads.clear(); } - public void removeAll( final ArrayList readsToRemove ) { reads.removeAll( readsToRemove ); } - public boolean equalExceptReads(final ActiveRegion other) { + /** + * Get an unmodifiable list of reads currently in this active region. + * + * The reads are sorted by their coordinate position + * + * @return an unmodifiable list of reads in this active region + */ + @Ensures("result != null") + public List getReads() { + return Collections.unmodifiableList(reads); + } + + /** + * Get the number of reads currently in this active region + * @return an integer >= 0 + */ + @Ensures("result >= 0") + public int size() { return reads.size(); } + + /** + * Add read to this active region + * + * Read must have alignment start >= than the last read currently in this active region. + * + * @throws IllegalArgumentException if read doesn't overlap the extended region of this active region + * + * @param read a non-null GATKSAMRecord + */ + @Ensures("reads.size() == old(reads.size()) + 1") + public void add( final GATKSAMRecord read ) { + if ( read == null ) throw new IllegalArgumentException("Read cannot be null"); + + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc( read ); + if ( ! readOverlapsRegion(read) ) + throw new IllegalArgumentException("Read location " + readLoc + " doesn't overlap with active region extended span " + extendedLoc); + + spanIncludingReads = spanIncludingReads.union( readLoc ); + + if ( ! reads.isEmpty() ) { + final GATKSAMRecord lastRead = reads.get(size() - 1); + if ( ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) ) + throw new IllegalArgumentException("Attempting to add a read to ActiveRegion not on the same contig as other reads: lastRead " + lastRead + " attempting to add " + read); + + if ( read.getAlignmentStart() < lastRead.getAlignmentStart() ) + throw new IllegalArgumentException("Attempting to add a read to ActiveRegion out of order w.r.t. other reads: lastRead " + lastRead + " at " + lastRead.getAlignmentStart() + " attempting to add " + read + " at " + read.getAlignmentStart()); + } + + reads.add( read ); + } + + /** + * Returns true if read would overlap the extended extent of this region + * @param read the read we want to test + * @return true if read can be added to this region, false otherwise + */ + public boolean readOverlapsRegion(final GATKSAMRecord read) { + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc( read ); + return readLoc.overlapsP(extendedLoc); + } + + /** + * Add all reads to this active region + * @param reads a collection of reads to add to this active region + */ + public void addAll(final Collection reads) { + if ( reads == null ) throw new IllegalArgumentException("reads cannot be null"); + for ( final GATKSAMRecord read : reads ) + add(read); + } + + /** + * Clear all of the reads currently in this active region + */ + @Ensures("size() == 0") + public void clearReads() { + spanIncludingReads = extendedLoc; + reads.clear(); + } + + /** + * Remove all of the reads in readsToRemove from this active region + * @param readsToRemove the collection of reads we want to remove + */ + public void removeAll( final Collection readsToRemove ) { + reads.removeAll(readsToRemove); + spanIncludingReads = extendedLoc; + for ( final GATKSAMRecord read : reads ) { + spanIncludingReads = spanIncludingReads.union( genomeLocParser.createGenomeLoc(read) ); + } + } + + /** + * Clips all of the reads in this active region so that none extend beyond the active region extended loc + * + * This function may change the getReadSpanLoc, as it updates the read span based on the new clipped + * read coordinates. + */ + public void hardClipToActiveRegion() { + final ArrayList clippedReads = ReadClipper.hardClipToRegion( reads, extendedLoc.getStart(), extendedLoc.getStop() ); + ReadUtils.sortReadsByCoordinate(clippedReads); + clearReads(); + addAll(clippedReads); + } + + /** + * Is this region equal to other, excluding any reads in either region in the comparison + * @param other the other active region we want to test + * @return true if this region is equal, excluding any reads and derived values, to other + */ + protected boolean equalExceptReads(final ActiveRegion other) { if ( activeRegionLoc.compareTo(other.activeRegionLoc) != 0 ) return false; - if ( isActive != other.isActive ) return false; + if ( isActive() != other.isActive()) return false; if ( genomeLocParser != other.genomeLocParser ) return false; if ( extension != other.extension ) return false; if ( extendedLoc.compareTo(other.extendedLoc) != 0 ) return false; return true; } + + /** + * Does this region represent an active region (all isActiveProbs above threshold) or + * an inactive region (all isActiveProbs below threshold)? + */ + public boolean isActive() { + return isActive; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index 0f5d6a2f7..1bf24814b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -115,6 +115,19 @@ public class ArtificialSAMUtils { return header; } + /** + * Creates an artificial sam header based on the sequence dictionary dict + * + * @return + */ + public static SAMFileHeader createArtificialSamHeader(final SAMSequenceDictionary dict) { + SAMFileHeader header = new SAMFileHeader(); + header.setSortOrder(net.sf.samtools.SAMFileHeader.SortOrder.coordinate); + header.setSequenceDictionary(dict); + return header; + } + + /** * setup a default read group for a SAMFileHeader * diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java new file mode 100644 index 000000000..d2ea5d11b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActiveRegionUnitTest.java @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.activeregion; + + +// the imports for unit testing. + + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + + +public class ActiveRegionUnitTest extends BaseTest { + private GenomeLocParser genomeLocParser; + private IndexedFastaSequenceFile seq; + private String contig; + private int contigLength; + + @BeforeClass + public void init() throws FileNotFoundException { + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + genomeLocParser = new GenomeLocParser(seq); + contig = "1"; + contigLength = genomeLocParser.getContigInfo(contig).getSequenceLength(); + } + + @DataProvider(name = "ActionRegionCreationTest") + public Object[][] makePollingData() { + List tests = new ArrayList(); + for ( final int start : Arrays.asList(1, 10, 100, contigLength - 10, contigLength - 1) ) { + for ( final int size : Arrays.asList(1, 10, 100, 1000) ) { + for ( final int ext : Arrays.asList(0, 1, 10, 100) ) { + for ( final boolean isActive : Arrays.asList(true, false) ) { + for ( final boolean addStates : Arrays.asList(true, false) ) { + List states = null; + if ( addStates ) { + states = new LinkedList(); + for ( int i = start; i < start + size; i++ ) { + states.add(new ActivityProfileState(genomeLocParser.createGenomeLoc(contig, i + start), isActive ? 1.0 : 0.0)); + } + } + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, start, start + size - 1); + tests.add(new Object[]{loc, states, isActive, ext}); + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "ActionRegionCreationTest") + public void testCreatingActiveRegions(final GenomeLoc loc, final List supportingStates, final boolean isActive, final int extension) { + final ActiveRegion region = new ActiveRegion(loc, supportingStates, isActive, genomeLocParser, extension); + Assert.assertEquals(region.getLocation(), loc); + Assert.assertEquals(region.getExtendedLoc().getStart(), Math.max(loc.getStart() - extension, 1)); + Assert.assertEquals(region.getExtendedLoc().getStop(), Math.min(loc.getStop() + extension, contigLength)); + Assert.assertEquals(region.getReadSpanLoc().getStart(), Math.max(loc.getStart() - extension, 1)); + Assert.assertEquals(region.getReadSpanLoc().getStop(), Math.min(loc.getStop() + extension, contigLength)); + Assert.assertEquals(region.isActive(), isActive); + Assert.assertEquals(region.getExtension(), extension); + Assert.assertEquals(region.getReads(), Collections.emptyList()); + Assert.assertEquals(region.size(), 0); + Assert.assertEquals(region.getSupportingStates(), supportingStates == null ? Collections.emptyList() : supportingStates); + Assert.assertNotNull(region.toString()); + + assertGoodReferenceGetter(region.getActiveRegionReference(seq), region.getExtendedLoc(), 0); + assertGoodReferenceGetter(region.getActiveRegionReference(seq, 10), region.getExtendedLoc(), 10); + assertGoodReferenceGetter(region.getFullReference(seq), region.getReadSpanLoc(), 0); + assertGoodReferenceGetter(region.getFullReference(seq, 10), region.getReadSpanLoc(), 10); + } + + private void assertGoodReferenceGetter(final byte[] actualBytes, final GenomeLoc span, final int padding) { + final int expectedStart = Math.max(span.getStart() - padding, 1); + final int expectedStop = Math.min(span.getStop() + padding, contigLength); + final byte[] expectedBytes = seq.getSubsequenceAt(span.getContig(), expectedStart, expectedStop).getBases(); + Assert.assertEquals(actualBytes, expectedBytes); + } + + @DataProvider(name = "ActiveRegionReads") + public Object[][] makeActiveRegionReads() { + List tests = new ArrayList(); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + for ( final int start : Arrays.asList(1, 10, 100, contigLength - 10, contigLength - 1) ) { + for ( final int readStartOffset : Arrays.asList(-100, -10, 0, 10, 100) ) { + for ( final int readSize : Arrays.asList(10, 100, 1000) ) { + final GenomeLoc loc = genomeLocParser.createGenomeLocOnContig(contig, start, start + 10); + + final int readStart = Math.max(start + readStartOffset, 1); + final int readStop = Math.min(readStart + readSize, contigLength); + final int readLength = readStop - readStart + 1; + if ( readLength > 0 ) { + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, readStart, readLength); + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc(read); + if ( readLoc.overlapsP(loc) ) + tests.add(new Object[]{loc, read}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ActiveRegionReads") + public void testActiveRegionReads(final GenomeLoc loc, final GATKSAMRecord read) { + final GenomeLoc expectedSpan = loc.union(genomeLocParser.createGenomeLoc(read)); + + final ActiveRegion region = new ActiveRegion(loc, null, true, genomeLocParser, 0); + final ActiveRegion region2 = new ActiveRegion(loc, null, true, genomeLocParser, 0); + Assert.assertEquals(region.getReads(), Collections.emptyList()); + Assert.assertEquals(region.size(), 0); + Assert.assertEquals(region.getExtendedLoc(), loc); + Assert.assertEquals(region.getReadSpanLoc(), loc); + Assert.assertTrue(region.equalExceptReads(region2)); + + region.add(read); + Assert.assertEquals(region.getReads(), Collections.singletonList(read)); + Assert.assertEquals(region.size(), 1); + Assert.assertEquals(region.getExtendedLoc(), loc); + Assert.assertEquals(region.getReadSpanLoc(), expectedSpan); + Assert.assertTrue(region.equalExceptReads(region2)); + + region.clearReads(); + Assert.assertEquals(region.getReads(), Collections.emptyList()); + Assert.assertEquals(region.size(), 0); + Assert.assertEquals(region.getExtendedLoc(), loc); + Assert.assertEquals(region.getReadSpanLoc(), loc); + Assert.assertTrue(region.equalExceptReads(region2)); + + region.addAll(Collections.singleton(read)); + Assert.assertEquals(region.getReads(), Collections.singletonList(read)); + Assert.assertEquals(region.size(), 1); + Assert.assertEquals(region.getExtendedLoc(), loc); + Assert.assertEquals(region.getReadSpanLoc(), expectedSpan); + Assert.assertTrue(region.equalExceptReads(region2)); + + region.removeAll(Collections.emptyList()); + Assert.assertEquals(region.getReads(), Collections.singletonList(read)); + Assert.assertEquals(region.size(), 1); + Assert.assertEquals(region.getExtendedLoc(), loc); + Assert.assertEquals(region.getReadSpanLoc(), expectedSpan); + Assert.assertTrue(region.equalExceptReads(region2)); + + region.removeAll(Collections.singletonList(read)); + Assert.assertEquals(region.getReads(), Collections.emptyList()); + Assert.assertEquals(region.size(), 0); + Assert.assertEquals(region.getExtendedLoc(), loc); + Assert.assertEquals(region.getReadSpanLoc(), loc); + Assert.assertTrue(region.equalExceptReads(region2)); + + region.add(read); + region.hardClipToActiveRegion(); + Assert.assertEquals(region.size(), 1); + Assert.assertEquals(region.getExtendedLoc(), loc); + Assert.assertEquals(region.getReadSpanLoc(), loc); + Assert.assertTrue(region.getReads().get(0).getAlignmentStart() >= region.getExtendedLoc().getStart()); + Assert.assertTrue(region.getReads().get(0).getAlignmentEnd() <= region.getExtendedLoc().getStop()); + } + + @DataProvider(name = "BadReadsTest") + public Object[][] makeBadReadsTest() { + List tests = new ArrayList(); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + tests.add(new Object[]{ + ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, 10), + ArtificialSAMUtils.createArtificialRead(header, "read2", 0, 9, 10)}); + tests.add(new Object[]{ + ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, 10), + ArtificialSAMUtils.createArtificialRead(header, "read2", 1, 9, 10)}); + tests.add(new Object[]{ + ArtificialSAMUtils.createArtificialRead(header, "read1", 1, 10, 10), + ArtificialSAMUtils.createArtificialRead(header, "read2", 0, 9, 10)}); + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "BadReadsTest", expectedExceptions = IllegalArgumentException.class) + public void testBadReads(final GATKSAMRecord read1, final GATKSAMRecord read2) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(read1); + final ActiveRegion region = new ActiveRegion(loc, null, true, genomeLocParser, 0); + region.add(read1); + region.add(read2); + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java index 1df4a3348..b9fdb3afe 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java @@ -276,7 +276,7 @@ public class ActivityProfileUnitTest extends BaseTest { Assert.assertTrue(regionOffset >= 0 && regionOffset < probs.size(), "Region " + region + " has a bad offset w.r.t. start"); for ( int j = 0; j < region.getLocation().size(); j++ ) { final int siteOffset = j + regionOffset; - Assert.assertEquals(region.isActive, probs.get(siteOffset).booleanValue()); + Assert.assertEquals(region.isActive(), probs.get(siteOffset).booleanValue()); Assert.assertFalse(seenSites.get(siteOffset), "Site " + j + " in " + region + " was seen already"); seenSites.set(siteOffset, true); } diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java index eb2eebd36..cb2a6bfb2 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java @@ -309,8 +309,8 @@ public class BandPassActivityProfileUnitTest extends BaseTest { lastPosSeen = region.getLocation().getStop(); for ( final ActivityProfileState state : region.getSupportingStates() ) { - Assert.assertEquals(state.isActiveProb > ActivityProfile.ACTIVE_PROB_THRESHOLD, region.isActive, - "Region is active=" + region.isActive + " but contains a state " + state + " with prob " + Assert.assertEquals(state.isActiveProb > ActivityProfile.ACTIVE_PROB_THRESHOLD, region.isActive(), + "Region is active=" + region.isActive() + " but contains a state " + state + " with prob " + state.isActiveProb + " not within expected values given threshold for activity of " + ActivityProfile.ACTIVE_PROB_THRESHOLD); }