diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 22d23f216..76c1ce8c5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -47,6 +47,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension ); + final List activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ); // add active regions to queue of regions to process workQueue.addAll( activeRegions ); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java index bb007893c..d27148884 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java @@ -16,4 +16,5 @@ import java.lang.annotation.RetentionPolicy; public @interface ActiveRegionExtension { public int extension() default 0; + public int maxRegion() default 1500; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index 8ff4b2f6f..f217268d2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -7,10 +7,7 @@ import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; -import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter; -import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter; -import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; +import org.broadinstitute.sting.gatk.filters.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -33,8 +30,8 @@ import java.util.List; @By(DataSource.READS) @Requires({DataSource.READS, DataSource.REFERENCE_BASES}) @PartitionBy(PartitionType.READ) -@ActiveRegionExtension(extension=50) -@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) +@ActiveRegionExtension(extension=50,maxRegion=1500) +@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class}) public abstract class ActiveRegionWalker extends Walker { @Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false) diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 37822dc84..764be2ac7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -15,7 +15,7 @@ import java.util.ArrayList; * Date: 1/4/12 */ -public class ActiveRegion implements HasGenomeLocation { +public class ActiveRegion implements HasGenomeLocation, Comparable { private final ArrayList reads = new ArrayList(); private final GenomeLoc activeRegionLoc; @@ -73,6 +73,11 @@ public class ActiveRegion implements HasGenomeLocation { Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases(); } + @Override + public int compareTo( final ActiveRegion other ) { + return this.getLocation().compareTo(other.getLocation()); + } + @Override public GenomeLoc getLocation() { return activeRegionLoc; } public GenomeLoc getExtendedLoc() { return extendedLoc; } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 1499f639d..6ef5a2af2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -24,8 +24,10 @@ package org.broadinstitute.sting.utils.activeregion; +import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.ArrayList; @@ -45,8 +47,16 @@ public class ActivityProfile { final boolean presetRegions; GenomeLoc regionStartLoc = null; final List isActiveList; - private GenomeLoc lastLoc = null; + private static final int FILTER_SIZE = 65; + private static final Double[] GaussianKernel; + + static { + GaussianKernel = new Double[2*FILTER_SIZE + 1]; + for( int iii = 0; iii < 2*FILTER_SIZE + 1; iii++ ) { + GaussianKernel[iii] = MathUtils.NormalDistribution(FILTER_SIZE, 40.0, iii); + } + } // todo -- add upfront the start and stop of the intervals // todo -- check that no regions are unexpectedly missing @@ -85,15 +95,13 @@ public class ActivityProfile { public ActivityProfile bandPassFilter() { final Double[] activeProbArray = isActiveList.toArray(new Double[isActiveList.size()]); final Double[] filteredProbArray = new Double[activeProbArray.length]; - final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // TODO: needs to be set-able by the walker author - for( int iii = 0; iii < activeProbArray.length; iii++ ) { - double maxVal = 0; - for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(isActiveList.size(), iii+FILTER_SIZE+1); jjj++ ) { - if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; } + if( !presetRegions ) { + for( int iii = 0; iii < activeProbArray.length; iii++ ) { + final Double[] kernel = (Double[]) ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii)); + final Double[] activeProbSubArray = (Double[]) ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1)); + filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel); } - filteredProbArray[iii] = maxVal; } - return new ActivityProfile(parser, presetRegions, Arrays.asList(filteredProbArray), regionStartLoc); } @@ -102,9 +110,9 @@ public class ActivityProfile { * @param activeRegionExtension * @return */ - public List createActiveRegions( final int activeRegionExtension ) { - final int MAX_ACTIVE_REGION = ( presetRegions ? 16001 : 425 ); // TODO: needs to be set-able by the walker author - final double ACTIVE_PROB_THRESHOLD = 0.2; // TODO: needs to be set-able by the walker author + public List createActiveRegions( final int activeRegionExtension, final int maxRegionSize ) { + final double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author + final ArrayList returnList = new ArrayList(); if( isActiveList.size() == 0 ) { // no elements in the active list, just return an empty one @@ -112,25 +120,22 @@ public class ActivityProfile { } else if( isActiveList.size() == 1 ) { // there's a single element, it's either active or inactive boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD; - final ActiveRegion region = createActiveRegion(isActive, 0, 0, activeRegionExtension ); - return Collections.singletonList(region); + returnList.addAll(createActiveRegion(isActive, 0, 0, activeRegionExtension, maxRegionSize)); } else { // there are 2+ elements, divide these up into regions - final ArrayList returnList = new ArrayList(); boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD; int curStart = 0; for(int iii = 1; iii < isActiveList.size(); iii++ ) { final boolean thisStatus = isActiveList.get(iii) > ACTIVE_PROB_THRESHOLD; - if( isActive != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) { - returnList.add( createActiveRegion(isActive, curStart, iii-1, activeRegionExtension) ); + if( isActive != thisStatus ) { + returnList.addAll(createActiveRegion(isActive, curStart, iii - 1, activeRegionExtension, maxRegionSize)); isActive = thisStatus; curStart = iii; } } - returnList.add( createActiveRegion(isActive, curStart, isActiveList.size()-1, activeRegionExtension) ); // close out the current active region - - return returnList; + returnList.addAll(createActiveRegion(isActive, curStart, isActiveList.size() - 1, activeRegionExtension, maxRegionSize)); // close out the current active region } + return returnList; } /** @@ -141,8 +146,25 @@ public class ActivityProfile { * @param activeRegionExtension * @return a fully initialized ActiveRegion with the above properties */ - private final ActiveRegion createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension) { - final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd); - return new ActiveRegion( loc, isActive, parser, activeRegionExtension ); + private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) { + return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize, new ArrayList()); + } + private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List returnList) { + if( !isActive || curEnd - curStart < maxRegionSize ) { + final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd); + returnList.add(new ActiveRegion(loc, isActive, parser, activeRegionExtension)); + return returnList; + } + // find the best place to break up the large active region + Double minProb = Double.MAX_VALUE; + int cutPoint = -1; + for( int iii = curStart + 45; iii < curEnd - 45; iii++ ) { // BUGBUG: assumes maxRegionSize >> 45 + if( isActiveList.get(iii) < minProb ) { minProb = isActiveList.get(iii); cutPoint = iii; } + } + final List leftList = createActiveRegion(isActive, curStart, cutPoint, activeRegionExtension, maxRegionSize, new ArrayList()); + final List rightList = createActiveRegion(isActive, cutPoint, curEnd, activeRegionExtension, maxRegionSize, new ArrayList()); + returnList.addAll( leftList ); + returnList.addAll( rightList ); + return returnList; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 771721169..81ba00888 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -32,8 +32,6 @@ public class PileupElement implements Comparable { protected final int eventLength; // what is the length of the event (insertion or deletion) *after* this base protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases - - /** * Creates a new pileup element. * diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java index 44cf87b45..7d1fc637b 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/activeregionqc/CountReadsInActiveRegionsIntegrationTest.java @@ -38,7 +38,7 @@ public class CountReadsInActiveRegionsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T CountReadsInActiveRegions -R " + b37KGReference + " -I " + b37GoodNA12878BAM + " -L 20:10,000,000-10,200,000 -o %s", 1, - Arrays.asList("fcd581aa6befe85c7297509fa7b34edf")); + Arrays.asList("1e9e8d637d2acde23fa99fe9dc07e3e2")); executeTest("CountReadsInActiveRegions:", spec); } } \ No newline at end of file