From 066da80a3d9a83ec5e59cd3e545c6c2bf102a625 Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Thu, 19 Jan 2012 18:19:58 -0500 Subject: [PATCH 1/3] Added KEEP_UNCONDTIONAL option which permits even sites with only filtered records to be included as unfiltered sites in the output --- .../sting/utils/variantcontext/VariantContextUtils.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index c9a4965c1..39045ea21 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -464,7 +464,11 @@ public class VariantContextUtils { /** * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. */ - KEEP_IF_ALL_UNFILTERED + KEEP_IF_ALL_UNFILTERED, + /** + * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset. + */ + KEEP_UNCONDITIONAL } /** @@ -635,7 +639,7 @@ public class VariantContextUtils { } // if at least one record was unfiltered and we want a union, clear all of the filters - if ( filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() ) + if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL ) filters.clear(); From ace93330688e2a55b1d6d81f2d4e4bba21698cb6 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 19 Jan 2012 22:05:08 -0500 Subject: [PATCH 3/3] Active region walkers can now see the reads in a buffer around thier active reigons. This buffer size is specified as a walker annotation. Intervals are internally extended by this buffer size so that the extra reads make their way through the traversal engine but the walker author only needs to see the original interval. Also, several corner case bug fixes in active region traversal. --- .../sting/gatk/GenomeAnalysisEngine.java | 12 ++- .../gatk/executive/LinearMicroScheduler.java | 2 +- .../traversals/TraverseActiveRegions.java | 89 ++++++++++++------- .../gatk/walkers/ActiveRegionExtension.java | 19 ++++ .../gatk/walkers/ActiveRegionWalker.java | 29 +++++- .../gatk/walkers/annotator/FisherStrand.java | 1 - .../broadinstitute/sting/utils/GenomeLoc.java | 2 +- .../sting/utils/GenomeLocSortedSet.java | 15 ++++ .../utils/activeregion/ActiveRegion.java | 29 +++--- 9 files changed, 149 insertions(+), 49 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index ede8e9340..6140d543a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -443,14 +443,22 @@ public class GenomeAnalysisEngine { if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available."); - if(walker instanceof LocusWalker || walker instanceof ActiveRegionWalker) { + if(walker instanceof LocusWalker) { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); if(intervals == null) return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); else return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); - } + } + else if(walker instanceof ActiveRegionWalker) { + if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer()); + } else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { // Apply special validation to read pair walkers. if(walker instanceof ReadPairWalker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 774b532f3..16487054b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -77,7 +77,7 @@ public class LinearMicroScheduler extends MicroScheduler { done = walker.isDone(); } - // Special function call to empty out the work queue. Ugly for now but will be cleaned up when we push this functionality more into the engine + // Special function call to empty out the work queue. Ugly for now but will be cleaned up when we eventually push this functionality more into the engine if( traversalEngine instanceof TraverseActiveRegions ) { final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 384affcb7..b18d1ceb9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -7,10 +7,9 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -46,13 +45,15 @@ public class TraverseActiveRegions extends TraversalEngine isActiveList = new ArrayList(); //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); @@ -90,9 +91,11 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine activeRegions = integrateActiveList( isActiveList ); logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." ); workQueue.addAll( activeRegions ); - } - while( workQueue.peek().getLocation().getStop() < minStart ) { - final ActiveRegion activeRegion = workQueue.remove(); - sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker ); + // Since we've sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them + if( !workQueue.isEmpty() ) { + while( workQueue.peek().getExtendedLoc().getStop() < minStart || !workQueue.peek().getExtendedLoc().getContig().equals(dataProvider.getLocus().getContig()) ) { + final ActiveRegion activeRegion = workQueue.remove(); + sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker ); + } + } } return sum; @@ -158,16 +173,18 @@ public class TraverseActiveRegions extends TraversalEngine> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLocation()); + logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); final M x = walker.map( activeRegion, null ); // BUGBUG: tracker needs to be filled in and passed to the walker return walker.reduce( x, sum ); } @@ -178,8 +195,8 @@ public class TraverseActiveRegions extends TraversalEngine walker, LocusShardDataProvider dataProvider ) { - DataSource dataSource = WalkerManager.getWalkerDataSource(walker); + private LocusView getLocusView( final Walker walker, final LocusShardDataProvider dataProvider ) { + final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); if( dataSource == DataSource.READS ) return new CoveredLocusView(dataProvider); else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) @@ -193,21 +210,29 @@ public class TraverseActiveRegions extends TraversalEngine integrateActiveList( final ArrayList activeList ) { final ArrayList returnList = new ArrayList(); - ActiveRegion prevLocus = activeList.remove(0); - ActiveRegion startLocus = prevLocus; - for( final ActiveRegion thisLocus : activeList ) { - if( prevLocus.isActive != thisLocus.isActive ) { - returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), - prevLocus.isActive, engine.getGenomeLocParser() ) ); - startLocus = thisLocus; + if( activeList.size() == 0 ) { + return returnList; + } else if( activeList.size() == 1 ) { + returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(activeList.get(0).getLocation().getContig(), activeList.get(0).getLocation().getStart(), activeList.get(0).getLocation().getStart()), + activeList.get(0).isActive, engine.getGenomeLocParser(), activeList.get(0).getExtension() ) ); + return returnList; + } else { + ActiveRegion prevLocus = activeList.get(0); + ActiveRegion startLocus = prevLocus; + for( final ActiveRegion thisLocus : activeList ) { + if( prevLocus.isActive != thisLocus.isActive || !prevLocus.getLocation().contiguousP( thisLocus.getLocation() ) ) { + returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), + prevLocus.isActive, engine.getGenomeLocParser(), startLocus.getExtension() ) ); + startLocus = thisLocus; + } + prevLocus = thisLocus; } - prevLocus = thisLocus; + // output the last region if necessary + if( startLocus != prevLocus ) { + returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), + prevLocus.isActive, engine.getGenomeLocParser(), startLocus.getExtension() ) ); + } + return returnList; } - // output the last region if necessary - if( startLocus != prevLocus ) { - returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), - prevLocus.isActive, engine.getGenomeLocParser() ) ); - } - return returnList; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java new file mode 100644 index 000000000..bb007893c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java @@ -0,0 +1,19 @@ +package org.broadinstitute.sting.gatk.walkers; + +import java.lang.annotation.Documented; +import java.lang.annotation.Inherited; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; + +/** + * Describes the size of the buffer region that is added to each active region when pulling in covered reads. + * User: rpoplin + * Date: 1/18/12 + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) + +public @interface ActiveRegionExtension { + public int extension() default 0; +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index d2891c959..8405f762d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -1,13 +1,26 @@ package org.broadinstitute.sting.gatk.walkers; +import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; +import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter; +import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter; +import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalUtils; + +import java.util.ArrayList; +import java.util.List; /** - * Created by IntelliJ IDEA. + * Base class for all the Active Region Walkers. * User: rpoplin * Date: 12/7/11 */ @@ -15,7 +28,10 @@ import org.broadinstitute.sting.utils.activeregion.ActiveRegion; @By(DataSource.READS) @Requires({DataSource.READS, DataSource.REFERENCE_BASES}) @PartitionBy(PartitionType.READ) +@ActiveRegionExtension(extension=50) +@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) public abstract class ActiveRegionWalker extends Walker { + // Do we actually want to operate on the context? public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { return true; // We are keeping all the reads @@ -26,4 +42,15 @@ public abstract class ActiveRegionWalker extends Walker allIntervals = new ArrayList(); + for( final GenomeLoc interval : intervals.toList() ) { + final int start = Math.max( 1, interval.getStart() - activeRegionExtension ); + final int stop = Math.min( reference.getSequenceDictionary().getSequence(interval.getContig()).getSequenceLength(), interval.getStop() + activeRegionExtension ); + allIntervals.add( genomeLocParser.createGenomeLoc(interval.getContig(), start, stop) ); + } + return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, IntervalMergingRule.ALL); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index c4025a25c..cb9bc103f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -73,7 +73,6 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat if ( pvalue == null ) return null; - // use Math.abs to prevent -0's Map map = new HashMap(); map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue))); return map; diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 6941b888b..ad10b61e7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -467,6 +467,6 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome } public long sizeOfOverlap( final GenomeLoc that ) { - return ( this.overlapsP(that) ? Math.min( getStop(), that.getStop() ) - Math.max( getStart(), that.getStart() ) : 0L ); + return ( this.overlapsP(that) ? Math.min( getStop(), that.getStop() ) - Math.max( getStart(), that.getStart() ) + 1L : 0L ); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index 26be0e59e..d11adf9e3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -127,6 +127,21 @@ public class GenomeLocSortedSet extends AbstractSet { return mArray.isEmpty(); } + /** + * Determine if the given loc overlaps any loc in the sorted set + * + * @param loc the location to test + * @return + */ + public boolean overlaps(final GenomeLoc loc) { + for(final GenomeLoc e : mArray) { + if(e.overlapsP(loc)) { + return true; + } + } + return false; + } + /** * add a genomeLoc to the collection, simply inserting in order into the set * diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index e8908480c..abf74469f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -18,21 +18,25 @@ public class ActiveRegion implements HasGenomeLocation { private final ArrayList reads = new ArrayList(); private byte[] reference = null; - private final GenomeLoc loc; - private GenomeLoc referenceLoc = null; + private final GenomeLoc activeRegionLoc; + private final GenomeLoc extendedLoc; + private final int extension; + private GenomeLoc fullExtentReferenceLoc = null; private final GenomeLocParser genomeLocParser; public final boolean isActive; - public ActiveRegion( final GenomeLoc loc, final boolean isActive, final GenomeLocParser genomeLocParser ) { - this.loc = loc; + public ActiveRegion( final GenomeLoc activeRegionLoc, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) { + this.activeRegionLoc = activeRegionLoc; this.isActive = isActive; this.genomeLocParser = genomeLocParser; - referenceLoc = loc; + this.extension = extension; + extendedLoc = genomeLocParser.createGenomeLoc(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); + fullExtentReferenceLoc = extendedLoc; } - // add each read to the bin and extend the reference genome loc if needed + // add each read to the bin and extend the reference genome activeRegionLoc if needed public void add( final GATKSAMRecord read, final boolean isPrimaryRegion ) { - referenceLoc = referenceLoc.union( genomeLocParser.createGenomeLoc( read ) ); + fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) ); reads.add( new ActiveRead(read, isPrimaryRegion) ); } @@ -41,15 +45,18 @@ public class ActiveRegion implements HasGenomeLocation { public byte[] getReference( final IndexedFastaSequenceFile referenceReader ) { // set up the reference if we haven't done so yet if ( reference == null ) { - reference = referenceReader.getSubsequenceAt(referenceLoc.getContig(), referenceLoc.getStart(), referenceLoc.getStop()).getBases(); + reference = referenceReader.getSubsequenceAt(fullExtentReferenceLoc.getContig(), fullExtentReferenceLoc.getStart(), fullExtentReferenceLoc.getStop()).getBases(); } return reference; } - public GenomeLoc getLocation() { return loc; } - - public GenomeLoc getReferenceLocation() { return referenceLoc; } + @Override + public GenomeLoc getLocation() { return activeRegionLoc; } + public GenomeLoc getExtendedLoc() { return extendedLoc; } + public GenomeLoc getReferenceLoc() { return fullExtentReferenceLoc; } + + public int getExtension() { return extension; } public int size() { return reads.size(); } } \ No newline at end of file