Bugfix for GSA-647 HaplotypeCaller misses good variant because the active region doesn't trigger for an exome

-- The logic for determining active regions was a bit broken in the HC when intervals were used in the system
-- TraverseActiveRegions now uses the AllLocus view, since we always want to see all reference sites, not just those covered.  Simplifies logic of TAR
-- Non-overlapping intervals are always treated as separate objects for determing active / inactive state.  This means that each exon will stand on its own when deciding if it should be active or inactive
-- Misc. cleanup, docs of some TAR infrastructure to make it safer and easier to debug in the future.
-- Committing the SingleExomeCalling script that I used to find this problem, and will continue to use in evaluating calling of a single exome with the HC
-- Make sure to get all of the reads into the set of potentially active reads, even for genomic locations that themselves don't overlap the engine intervals but may have reads that overlap the regions
-- Remove excessively expensive calls to check bases are upper cased in ReferenceContext
-- Update md5s after a lot of manual review and discussion with Ryan
This commit is contained in:
Mark DePristo 2012-10-30 16:58:55 -04:00
parent 9cd04c335c
commit 1444cd753b
9 changed files with 226 additions and 135 deletions

View File

@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
@ -41,7 +40,10 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
@ -212,7 +214,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
private VariantAnnotatorEngine annotationEngine;
// fasta reference reader to supplement the edges of the reference sequence
private IndexedFastaSequenceFile referenceReader;
private CachingIndexedFastaSequenceFile referenceReader;
// reference base padding size
private static final int REFERENCE_PADDING = 900;
@ -324,15 +326,15 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
}
}
if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) {
return new ActivityProfileResult(1.0);
return new ActivityProfileResult(ref.getLocus(), 1.0);
}
}
if( USE_ALLELES_TRIGGER ) {
return new ActivityProfileResult( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 );
return new ActivityProfileResult( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 );
}
if( context == null ) { return new ActivityProfileResult(0.0); }
if( context == null ) { return new ActivityProfileResult(ref.getLocus(), 0.0); }
final List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
noCall.add(Allele.NO_CALL);
@ -369,7 +371,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL);
final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() );
return new ActivityProfileResult( isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() );
return new ActivityProfileResult( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() );
}
//---------------------------------------------------------------------------------------------------------------

View File

@ -53,7 +53,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleSymbolic() {
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "16013a9203367c3d1c4ce1dcdc81ef4a");
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "d86fae2d1b504b422b7b0cfbbdecc2c4");
}
private void HCTestIndelQualityScores(String bam, String args, String md5) {
@ -69,8 +69,8 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c306140ad28515ee06c603c225217939"));
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("f6326adfdf5bc147626b30a89ce06d56"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}

View File

@ -78,7 +78,7 @@ public class ReferenceContext {
*
* @return
*/
@Ensures({"result != null", "BaseUtils.isUpperCase(result)"})
@Ensures({"result != null"})
public byte[] getBases();
}
@ -143,6 +143,9 @@ public class ReferenceContext {
private void fetchBasesFromProvider() {
if ( basesCache == null ) {
basesCache = basesProvider.getBases();
// must be an assertion that only runs when the bases are fetch to run in a reasonable amount of time
assert BaseUtils.isUpperCase(basesCache);
}
}
@ -172,7 +175,6 @@ public class ReferenceContext {
* Get the base at the given locus.
* @return The base at the given locus from the reference.
*/
@Ensures("BaseUtils.isUpperCase(result)")
public byte getBase() {
return getBases()[(locus.getStart() - window.getStart())];
}
@ -182,7 +184,7 @@ public class ReferenceContext {
* @return All bases available. If the window is of size [0,0], the array will
* contain only the base at the given locus.
*/
@Ensures({"result != null", "result.length > 0", "BaseUtils.isUpperCase(result)"})
@Ensures({"result != null", "result.length > 0"})
public byte[] getBases() {
fetchBasesFromProvider();
return basesCache;
@ -191,7 +193,7 @@ public class ReferenceContext {
/**
* All the bases in the window from the current base forward to the end of the window.
*/
@Ensures({"result != null", "result.length > 0", "BaseUtils.isUpperCase(result)"})
@Ensures({"result != null", "result.length > 0"})
public byte[] getForwardBases() {
final byte[] bases = getBases();
final int mid = locus.getStart() - window.getStart();

View File

@ -11,7 +11,6 @@ import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
@ -46,99 +45,127 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
T sum) {
logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));
final LocusView locusView = getLocusView( walker, dataProvider );
final GenomeLocSortedSet initialIntervals = engine.getIntervals();
final LocusView locusView = new AllLocusView(dataProvider);
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
int minStart = Integer.MAX_VALUE;
ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
int minStart = Integer.MAX_VALUE;
final List<ActiveRegion> activeRegions = new LinkedList<ActiveRegion>();
ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
// We keep processing while the next reference location is within the interval
GenomeLoc prevLoc = null;
while( locusView.hasNext() ) {
final AlignmentContext locus = locusView.next();
GenomeLoc location = locus.getLocation();
// We keep processing while the next reference location is within the interval
GenomeLoc prevLoc = null;
while( locusView.hasNext() ) {
final AlignmentContext locus = locusView.next();
final GenomeLoc location = locus.getLocation();
if(prevLoc != null) {
// fill in the active / inactive labels from the stop of the previous location to the start of this location
// TODO refactor to separate function
for(int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++ ) {
final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii);
if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) {
profile.add(fakeLoc, new ActivityProfileResult( walker.hasPresetActiveRegions() && walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 ));
}
}
// Grab all the previously unseen reads from this pileup and add them to the massive read list
// Note that this must occur before we leave because we are outside the intervals because
// reads may occur outside our intervals but overlap them in the future
// TODO -- this whole HashSet logic should be changed to a linked list of reads with
// TODO -- subsequent pass over them to find the ones overlapping the active regions
for( final PileupElement p : locus.getBasePileup() ) {
final GATKSAMRecord read = p.getRead();
if( !myReads.contains(read) ) {
myReads.add(read);
}
dataProvider.getShard().getReadMetrics().incrementNumIterations();
// create reference context. Note that if we have a pileup of "extended events", the context will
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
final ReferenceContext refContext = referenceView.getReferenceContext(location);
// Iterate forward to get all reference ordered data covering this location
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
// Call the walkers isActive function for this locus and add them to the list to be integrated later
if( initialIntervals == null || initialIntervals.overlaps( location ) ) {
profile.add(location, walkerActiveProb(walker, tracker, refContext, locus, location));
}
// Grab all the previously unseen reads from this pileup and add them to the massive read list
for( final PileupElement p : locus.getBasePileup() ) {
final GATKSAMRecord read = p.getRead();
if( !myReads.contains(read) ) {
myReads.add(read);
}
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
// which active regions in the work queue are now safe to process
minStart = Math.min(minStart, read.getAlignmentStart());
}
prevLoc = location;
printProgress(locus.getLocation());
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
// which active regions in the work queue are now safe to process
minStart = Math.min(minStart, read.getAlignmentStart());
}
updateCumulativeMetrics(dataProvider.getShard());
// skip this location -- it's not part of our engine intervals
// TODO -- this is dangerously slow with current overlaps implementation : GSA-649 / GenomeLocSortedSet.overlaps is crazy slow
if ( outsideEngineIntervals(location) )
continue;
// Take the individual isActive calls and integrate them into contiguous active regions and
// add these blocks of work to the work queue
// band-pass filter the list of isActive probabilities and turn into active regions
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize );
// add active regions to queue of regions to process
// first check if can merge active regions over shard boundaries
if( !activeRegions.isEmpty() ) {
if( !workQueue.isEmpty() ) {
final ActiveRegion last = workQueue.getLast();
final ActiveRegion first = activeRegions.get(0);
if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) {
workQueue.removeLast();
activeRegions.remove(first);
workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) );
}
}
workQueue.addAll( activeRegions );
if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) {
// we've move across some interval boundary, restart profile
profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
}
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
dataProvider.getShard().getReadMetrics().incrementNumIterations();
// now go and process all of the active regions
sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
// create reference context. Note that if we have a pileup of "extended events", the context will
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
final ReferenceContext refContext = referenceView.getReferenceContext(location);
// Iterate forward to get all reference ordered data covering this location
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
// Call the walkers isActive function for this locus and add them to the list to be integrated later
profile.add(walkerActiveProb(walker, tracker, refContext, locus, location));
prevLoc = location;
printProgress(locus.getLocation());
}
updateCumulativeMetrics(dataProvider.getShard());
if ( ! profile.isEmpty() )
incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
// add active regions to queue of regions to process
// first check if can merge active regions over shard boundaries
if( !activeRegions.isEmpty() ) {
if( !workQueue.isEmpty() ) {
final ActiveRegion last = workQueue.getLast();
final ActiveRegion first = activeRegions.get(0);
if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) {
workQueue.removeLast();
activeRegions.remove(first);
workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) );
}
}
workQueue.addAll( activeRegions );
}
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
// now go and process all of the active regions
sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
return sum;
}
/**
* Is the loc outside of the intervals being requested for processing by the GATK?
* @param loc
* @return
*/
private boolean outsideEngineIntervals(final GenomeLoc loc) {
return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc);
}
/**
* Take the individual isActive calls and integrate them into contiguous active regions and
* add these blocks of work to the work queue
* band-pass filter the list of isActive probabilities and turn into active regions
*
* @param profile
* @param activeRegions
* @param activeRegionExtension
* @param maxRegionSize
* @return
*/
private ActivityProfile incorporateActiveRegions(final ActivityProfile profile,
final List<ActiveRegion> activeRegions,
final int activeRegionExtension,
final int maxRegionSize) {
if ( profile.isEmpty() )
throw new IllegalStateException("trying to incorporate an empty active profile " + profile);
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ));
return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() );
}
// --------------------------------------------------------------------------------
//
@ -150,7 +177,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
final RefMetaDataTracker tracker, final ReferenceContext refContext,
final AlignmentContext locus, final GenomeLoc location) {
if ( walker.hasPresetActiveRegions() ) {
return new ActivityProfileResult(walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0);
return new ActivityProfileResult(location, walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0);
} else {
return walker.isActive( tracker, refContext, locus );
}
@ -250,30 +277,6 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
return walker.reduce( x, sum );
}
// --------------------------------------------------------------------------------
//
// engine interaction code
//
// --------------------------------------------------------------------------------
/**
* Gets the best view of loci for this walker given the available data.
* @param walker walker to interrogate.
* @param dataProvider Data which which to drive the locus view.
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
*/
private LocusView getLocusView( final Walker<M,T> walker, final LocusShardDataProvider dataProvider ) {
final DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
if( dataSource == DataSource.READS )
return new CoveredLocusView(dataProvider);
else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers )
return new AllLocusView(dataProvider);
else if( dataSource == DataSource.REFERENCE_ORDERED_DATA )
return new RodLocusView(dataProvider);
else
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
}
/**
* Special function called in LinearMicroScheduler to empty out the work queue.
* Ugly for now but will be cleaned up when we push this functionality more into the engine

View File

@ -57,7 +57,7 @@ public class FindCoveredIntervals extends ActiveRegionWalker<GenomeLoc, Long> {
int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup());
// note the linear probability scale
return new ActivityProfileResult(Math.min(depth / coverageThreshold, 1));
return new ActivityProfileResult(ref.getLocus(), Math.min(depth / coverageThreshold, 1));
}

View File

@ -1,11 +1,11 @@
package org.broadinstitute.sting.utils.activeregion;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.util.StringUtil;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.ArrayList;
@ -54,27 +54,31 @@ public class ActiveRegion implements HasGenomeLocation {
public ArrayList<GATKSAMRecord> getReads() { return reads; }
public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) {
@Requires("referenceReader.isUppercasingBases()")
public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader ) {
return getActiveRegionReference(referenceReader, 0);
}
public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
@Requires("referenceReader.isUppercasingBases()")
public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) {
return getReference( referenceReader, padding, extendedLoc );
}
public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) {
@Requires("referenceReader.isUppercasingBases()")
public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader ) {
return getFullReference(referenceReader, 0);
}
public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) {
@Requires("referenceReader.isUppercasingBases()")
public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) {
return getReference( referenceReader, padding, fullExtentReferenceLoc );
}
private byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
@Requires("referenceReader.isUppercasingBases()")
private byte[] getReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) {
final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(),
Math.max(1, genomeLoc.getStart() - padding),
Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases();
StringUtil.toUpperCase(reference);
return reference;
}

View File

@ -24,11 +24,11 @@
package org.broadinstitute.sting.utils.activeregion;
import com.google.java.contract.Requires;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.Collections;
@ -45,6 +45,7 @@ public class ActivityProfile {
final GenomeLocParser parser;
final boolean presetRegions;
GenomeLoc regionStartLoc = null;
GenomeLoc regionStopLoc = null;
final List<ActivityProfileResult> isActiveList;
private static final int FILTER_SIZE = 80;
private static final double[] GaussianKernel;
@ -71,19 +72,49 @@ public class ActivityProfile {
this.regionStartLoc = regionStartLoc;
}
public void add(final GenomeLoc loc, final ActivityProfileResult result) {
if ( loc.size() != 1 )
throw new ReviewedStingException("Bad add call to ActivityProfile: loc " + loc + " size != 1" );
isActiveList.add(result);
if( regionStartLoc == null ) {
@Override
public String toString() {
return "ActivityProfile{" +
"start=" + regionStartLoc +
", stop=" + regionStopLoc +
'}';
}
/**
* Add the next ActivityProfileResult to this profile.
*
* Must be contiguous with the previously added result, or an IllegalArgumentException will be thrown
*
* @param result a well-formed ActivityProfileResult result to incorporate into this profile
*/
@Requires("result != null")
public void add(final ActivityProfileResult result) {
final GenomeLoc loc = result.getLoc();
if ( regionStartLoc == null ) {
regionStartLoc = loc;
regionStopLoc = loc;
} else {
if ( regionStopLoc.getStart() != loc.getStart() - 1 )
throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediate after last loc " + regionStopLoc );
regionStopLoc = loc;
}
isActiveList.add(result);
}
public int size() {
return isActiveList.size();
}
public boolean isEmpty() {
return isActiveList.isEmpty();
}
public boolean hasPresetRegions() {
return presetRegions;
}
/**
* Band pass this ActivityProfile, producing a new profile that's band pass filtered
* @return a new ActivityProfile that's the band-pass filtered version of this profile
@ -104,14 +135,21 @@ public class ActivityProfile {
}
iii++;
}
final double[] filteredProbArray = new double[activeProbArray.length];
final double[] filteredProbArray;
if( !presetRegions ) {
// if we aren't using preset regions, actually apply the band pass filter for activeProbArray into filteredProbArray
filteredProbArray = new double[activeProbArray.length];
for( iii = 0; iii < activeProbArray.length; iii++ ) {
final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii));
final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1));
filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel);
}
} else {
// otherwise we simply use the activeProbArray directly
filteredProbArray = activeProbArray;
}
iii = 0;
for( final double prob : filteredProbArray ) {
final ActivityProfileResult result = isActiveList.get(iii++);
@ -119,6 +157,7 @@ public class ActivityProfile {
result.resultState = ActivityProfileResult.ActivityProfileResultState.NONE;
result.resultValue = null;
}
return new ActivityProfile(parser, presetRegions, isActiveList, regionStartLoc);
}
@ -166,6 +205,7 @@ public class ActivityProfile {
private final List<ActiveRegion> createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) {
return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize, new ArrayList<ActiveRegion>());
}
private final List<ActiveRegion> createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List<ActiveRegion> returnList) {
if( !isActive || curEnd - curStart < maxRegionSize ) {
final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd);

View File

@ -1,12 +1,16 @@
package org.broadinstitute.sting.utils.activeregion;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.GenomeLoc;
/**
* Created with IntelliJ IDEA.
* User: rpoplin
* Date: 7/27/12
*/
public class ActivityProfileResult {
private GenomeLoc loc;
public double isActiveProb;
public ActivityProfileResultState resultState;
public Number resultValue;
@ -16,16 +20,52 @@ public class ActivityProfileResult {
HIGH_QUALITY_SOFT_CLIPS
}
public ActivityProfileResult( final double isActiveProb ) {
this.isActiveProb = isActiveProb;
this.resultState = ActivityProfileResultState.NONE;
this.resultValue = null;
/**
* Create a new ActivityProfileResult at loc with probability of being active of isActiveProb
*
* @param loc the position of the result profile (for debugging purposes)
* @param isActiveProb the probability of being active (between 0 and 1)
*/
@Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"})
public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb ) {
this(loc, isActiveProb, ActivityProfileResultState.NONE, null);
}
public ActivityProfileResult( final double isActiveProb, final ActivityProfileResultState resultState, final Number resultValue ) {
/**
* Create a new ActivityProfileResult at loc with probability of being active of isActiveProb that maintains some
* information about the result state and value (TODO RYAN -- what do these mean?)
*
* @param loc the position of the result profile (for debugging purposes)
* @param isActiveProb the probability of being active (between 0 and 1)
*/
@Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"})
public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb, final ActivityProfileResultState resultState, final Number resultValue ) {
// make sure the location of that activity profile is 1
if ( loc.size() != 1 )
throw new IllegalArgumentException("Location for an ActivityProfileResult must have to size 1 bp but saw " + loc);
this.loc = loc;
this.isActiveProb = isActiveProb;
this.resultState = resultState;
this.resultValue = resultValue;
}
/**
* Get the genome loc associated with the ActivityProfileResult
* @return the location of this result
*/
@Ensures("result != null")
public GenomeLoc getLoc() {
return loc;
}
@Override
public String toString() {
return "ActivityProfileResult{" +
"loc=" + loc +
", isActiveProb=" + isActiveProb +
", resultState=" + resultState +
", resultValue=" + resultValue +
'}';
}
}

View File

@ -123,7 +123,7 @@ public class ActivityProfileUnitTest extends BaseTest {
for ( int i = 0; i < cfg.probs.size(); i++ ) {
double p = cfg.probs.get(i);
GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i);
profile.add(loc, new ActivityProfileResult(p));
profile.add(new ActivityProfileResult(loc, p));
}
Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() ));