diff --git a/build.xml b/build.xml index dbdafa3d9..ffb564cd2 100644 --- a/build.xml +++ b/build.xml @@ -281,6 +281,10 @@ + + + + @@ -331,7 +335,7 @@ - + @@ -761,6 +765,7 @@ + @@ -811,7 +816,23 @@ - + + + + + + + + + + + + + + + + + @@ -852,6 +873,7 @@ + @@ -934,6 +956,9 @@ + + + diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index ede8e9340..6140d543a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -443,14 +443,22 @@ public class GenomeAnalysisEngine { if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available."); - if(walker instanceof LocusWalker || walker instanceof ActiveRegionWalker) { + if(walker instanceof LocusWalker) { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); if(intervals == null) return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); else return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); - } + } + else if(walker instanceof ActiveRegionWalker) { + if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer()); + } else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { // Apply special validation to read pair walkers. if(walker instanceof ReadPairWalker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 774b532f3..16487054b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -77,7 +77,7 @@ public class LinearMicroScheduler extends MicroScheduler { done = walker.isDone(); } - // Special function call to empty out the work queue. Ugly for now but will be cleaned up when we push this functionality more into the engine + // Special function call to empty out the work queue. Ugly for now but will be cleaned up when we eventually push this functionality more into the engine if( traversalEngine instanceof TraverseActiveRegions ) { final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 384affcb7..ebfcc0c29 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -7,10 +7,9 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -46,13 +45,15 @@ public class TraverseActiveRegions extends TraversalEngine isActiveList = new ArrayList(); //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); @@ -90,9 +91,11 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine activeRegions = integrateActiveList( isActiveList ); logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." ); workQueue.addAll( activeRegions ); - } - while( workQueue.peek().getLocation().getStop() < minStart ) { - final ActiveRegion activeRegion = workQueue.remove(); - sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker ); + // Since we've sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them + if( !workQueue.isEmpty() ) { + while( workQueue.peek().getExtendedLoc().getStop() < minStart || !workQueue.peek().getExtendedLoc().getContig().equals(dataProvider.getLocus().getContig()) ) { + final ActiveRegion activeRegion = workQueue.remove(); + sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker ); + } + } } return sum; @@ -151,23 +166,27 @@ public class TraverseActiveRegions extends TraversalEngine> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLocation()); + logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); final M x = walker.map( activeRegion, null ); // BUGBUG: tracker needs to be filled in and passed to the walker return walker.reduce( x, sum ); } @@ -178,8 +197,8 @@ public class TraverseActiveRegions extends TraversalEngine walker, LocusShardDataProvider dataProvider ) { - DataSource dataSource = WalkerManager.getWalkerDataSource(walker); + private LocusView getLocusView( final Walker walker, final LocusShardDataProvider dataProvider ) { + final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); if( dataSource == DataSource.READS ) return new CoveredLocusView(dataProvider); else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) @@ -190,24 +209,32 @@ public class TraverseActiveRegions extends TraversalEngine integrateActiveList( final ArrayList activeList ) { final ArrayList returnList = new ArrayList(); - ActiveRegion prevLocus = activeList.remove(0); - ActiveRegion startLocus = prevLocus; - for( final ActiveRegion thisLocus : activeList ) { - if( prevLocus.isActive != thisLocus.isActive ) { - returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), - prevLocus.isActive, engine.getGenomeLocParser() ) ); - startLocus = thisLocus; + if( activeList.size() == 0 ) { + return returnList; + } else if( activeList.size() == 1 ) { + returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(activeList.get(0).getLocation().getContig(), activeList.get(0).getLocation().getStart(), activeList.get(0).getLocation().getStart()), + activeList.get(0).isActive, engine.getGenomeLocParser(), activeList.get(0).getExtension() ) ); + return returnList; + } else { + ActiveRegion prevLocus = activeList.get(0); + ActiveRegion startLocus = prevLocus; + for( final ActiveRegion thisLocus : activeList ) { + if( prevLocus.isActive != thisLocus.isActive || !prevLocus.getLocation().contiguousP( thisLocus.getLocation() ) ) { + returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), + prevLocus.isActive, engine.getGenomeLocParser(), startLocus.getExtension() ) ); + startLocus = thisLocus; + } + prevLocus = thisLocus; } - prevLocus = thisLocus; + // output the last region if necessary + if( startLocus != prevLocus ) { + returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), + prevLocus.isActive, engine.getGenomeLocParser(), startLocus.getExtension() ) ); + } + return returnList; } - // output the last region if necessary - if( startLocus != prevLocus ) { - returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), - prevLocus.isActive, engine.getGenomeLocParser() ) ); - } - return returnList; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java new file mode 100644 index 000000000..bb007893c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java @@ -0,0 +1,19 @@ +package org.broadinstitute.sting.gatk.walkers; + +import java.lang.annotation.Documented; +import java.lang.annotation.Inherited; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; + +/** + * Describes the size of the buffer region that is added to each active region when pulling in covered reads. + * User: rpoplin + * Date: 1/18/12 + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) + +public @interface ActiveRegionExtension { + public int extension() default 0; +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index d2891c959..d7e170d73 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -1,13 +1,26 @@ package org.broadinstitute.sting.gatk.walkers; +import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; +import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter; +import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter; +import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalUtils; + +import java.util.ArrayList; +import java.util.List; /** - * Created by IntelliJ IDEA. + * Base class for all the Active Region Walkers. * User: rpoplin * Date: 12/7/11 */ @@ -15,15 +28,33 @@ import org.broadinstitute.sting.utils.activeregion.ActiveRegion; @By(DataSource.READS) @Requires({DataSource.READS, DataSource.REFERENCE_BASES}) @PartitionBy(PartitionType.READ) +@ActiveRegionExtension(extension=50) +@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) public abstract class ActiveRegionWalker extends Walker { + // Do we actually want to operate on the context? public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { return true; // We are keeping all the reads } + public boolean wantsNonPrimaryReads() { + return false; + } + // Determine active status over the AlignmentContext public abstract boolean isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context); // Map over the ActiveRegion public abstract MapType map(final ActiveRegion activeRegion, final ReadMetaDataTracker metaDataTracker); + + public final GenomeLocSortedSet extendIntervals( final GenomeLocSortedSet intervals, final GenomeLocParser genomeLocParser, IndexedFastaSequenceFile reference ) { + final int activeRegionExtension = this.getClass().getAnnotation(ActiveRegionExtension.class).extension(); + final List allIntervals = new ArrayList(); + for( final GenomeLoc interval : intervals.toList() ) { + final int start = Math.max( 1, interval.getStart() - activeRegionExtension ); + final int stop = Math.min( reference.getSequenceDictionary().getSequence(interval.getContig()).getSequenceLength(), interval.getStop() + activeRegionExtension ); + allIntervals.add( genomeLocParser.createGenomeLoc(interval.getContig(), start, stop) ); + } + return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, IntervalMergingRule.ALL); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index c4025a25c..0dda02421 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -54,7 +54,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat private static final double MIN_PVALUE = 1E-320; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( ! vc.isVariant() || vc.isFiltered() ) + if ( !vc.isVariant() ) return null; int[][] table; @@ -73,7 +73,6 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat if ( pvalue == null ) return null; - // use Math.abs to prevent -0's Map map = new HashMap(); map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue))); return map; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index f827856be..ec67563dc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -241,6 +241,11 @@ public class BeagleOutputToVCFWalker extends RodWalker { String alleleA = beagleGenotypePairs.get(0); String alleleB = beagleGenotypePairs.get(1); + if ( alleleA.equals("null") || alleleB.equals("null") ) { + logger.warn("Beagle produced 'null' alleles at location "+ref.getLocus().toString()+". Ignoring."); + return 0; + } + // Beagle always produces genotype strings based on the strings we input in the likelihood file. String refString = vc_input.getReference().getDisplayString(); if (refString.length() == 0) // ref was null @@ -315,8 +320,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { og = a1+"/"+a2; // See if Beagle switched genotypes - if (!((bglAlleleA.equals(originalAlleleA) && bglAlleleB.equals(originalAlleleB) || - (bglAlleleA.equals(originalAlleleB) && bglAlleleB.equals(originalAlleleA))))){ + if (! originalAlleleA.equals(Allele.NO_CALL) && beagleSwitchedGenotypes(bglAlleleA,originalAlleleA,bglAlleleB,originalAlleleB)){ originalAttributes.put("OG",og); numGenotypesChangedByBeagle++; } @@ -359,6 +363,11 @@ public class BeagleOutputToVCFWalker extends RodWalker { return 1; } + private boolean beagleSwitchedGenotypes(Allele bglAlleleA, Allele originalAlleleA, Allele bglAlleleB, Allele originalAlleleB) { + return !((bglAlleleA.equals(originalAlleleA) && bglAlleleB.equals(originalAlleleB) || + (bglAlleleA.equals(originalAlleleB) && bglAlleleB.equals(originalAlleleA)))); + } + public Integer reduceInit() { return 0; // Nothing to do here } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index 8278dbab7..7eb6fad54 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -139,6 +139,12 @@ public class VariantFiltrationWalker extends RodWalker { @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false) protected Boolean FAIL_MISSING_VALUES = false; + /** + * Invalidate previous filters applied to the VariantContext, applying only the filters here + */ + @Argument(fullName="invalidatePreviousFilters",doc="Remove previous filters applied to the VCF",required=false) + boolean invalidatePrevious = false; + // JEXL expressions for the filters List filterExps; List genotypeFilterExps; @@ -215,6 +221,9 @@ public class VariantFiltrationWalker extends RodWalker { for ( VariantContext vc : VCs ) { + if ( invalidatePrevious ) { + vc = (new VariantContextBuilder(vc)).filters(new HashSet()).make(); + } // filter based on previous mask position if ( previousMaskPosition != null && // we saw a previous mask site previousMaskPosition.getContig().equals(vc.getChr()) && // it's on the same contig diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java index 88a9668cc..bdf25419f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java @@ -155,7 +155,9 @@ public class CountCovariatesWalker extends LocusWalker> knownSites = Collections.emptyList(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java index 6d2ac643b..378765051 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.List; @@ -126,7 +125,7 @@ public class VariantRecalibratorEngine { iteration++; model.maximizationStep( data ); currentChangeInMixtureCoefficients = model.normalizePMixtureLog10(); - model.expectationStep(data); + model.expectationStep( data ); if( iteration % 5 == 0 ) { // cut down on the number of output lines so that users can read the warning messages logger.info("Finished iteration " + iteration + ". \tCurrent change in mixture coefficients = " + String.format("%.5f", currentChangeInMixtureCoefficients)); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 096085330..af05c0dc4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -105,7 +105,7 @@ public class CombineVariants extends RodWalker { * and each named argument will be labeled as such in the output (i.e., set=name rather than * set=variants2). The order of arguments does not matter unless except for the naming, so * if you provide an rod priority list and no explicit names than variants, variants2, etc - * are techincally order dependent. It is strongly recommended to provide explicit names when + * are technically order dependent. It is strongly recommended to provide explicit names when * a rod priority list is provided. */ @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 6941b888b..ad10b61e7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -467,6 +467,6 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome } public long sizeOfOverlap( final GenomeLoc that ) { - return ( this.overlapsP(that) ? Math.min( getStop(), that.getStop() ) - Math.max( getStart(), that.getStart() ) : 0L ); + return ( this.overlapsP(that) ? Math.min( getStop(), that.getStop() ) - Math.max( getStart(), that.getStart() ) + 1L : 0L ); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index 26be0e59e..d11adf9e3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -127,6 +127,21 @@ public class GenomeLocSortedSet extends AbstractSet { return mArray.isEmpty(); } + /** + * Determine if the given loc overlaps any loc in the sorted set + * + * @param loc the location to test + * @return + */ + public boolean overlaps(final GenomeLoc loc) { + for(final GenomeLoc e : mArray) { + if(e.overlapsP(loc)) { + return true; + } + } + return false; + } + /** * add a genomeLoc to the collection, simply inserting in order into the set * diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRead.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRead.java deleted file mode 100644 index 8d08a29b6..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRead.java +++ /dev/null @@ -1,19 +0,0 @@ -package org.broadinstitute.sting.utils.activeregion; - -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: 1/4/12 - */ - -public class ActiveRead { - final public GATKSAMRecord read; - final public boolean isPrimaryRegion; - - ActiveRead( final GATKSAMRecord read, final boolean isPrimaryRegion ) { - this.read = read; - this.isPrimaryRegion = isPrimaryRegion; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index e8908480c..6279e0061 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -16,40 +16,49 @@ import java.util.ArrayList; public class ActiveRegion implements HasGenomeLocation { - private final ArrayList reads = new ArrayList(); - private byte[] reference = null; - private final GenomeLoc loc; - private GenomeLoc referenceLoc = null; + private final ArrayList reads = new ArrayList(); + private final GenomeLoc activeRegionLoc; + private final GenomeLoc extendedLoc; + private final int extension; + private GenomeLoc fullExtentReferenceLoc = null; private final GenomeLocParser genomeLocParser; public final boolean isActive; - public ActiveRegion( final GenomeLoc loc, final boolean isActive, final GenomeLocParser genomeLocParser ) { - this.loc = loc; + public ActiveRegion( final GenomeLoc activeRegionLoc, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) { + this.activeRegionLoc = activeRegionLoc; this.isActive = isActive; this.genomeLocParser = genomeLocParser; - referenceLoc = loc; + this.extension = extension; + extendedLoc = genomeLocParser.createGenomeLoc(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); + fullExtentReferenceLoc = extendedLoc; } - // add each read to the bin and extend the reference genome loc if needed - public void add( final GATKSAMRecord read, final boolean isPrimaryRegion ) { - referenceLoc = referenceLoc.union( genomeLocParser.createGenomeLoc( read ) ); - reads.add( new ActiveRead(read, isPrimaryRegion) ); + // add each read to the bin and extend the reference genome activeRegionLoc if needed + public void add( final GATKSAMRecord read ) { + fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) ); + reads.add( read ); } - public ArrayList getReads() { return reads; } + public ArrayList getReads() { return reads; } public byte[] getReference( final IndexedFastaSequenceFile referenceReader ) { - // set up the reference if we haven't done so yet - if ( reference == null ) { - reference = referenceReader.getSubsequenceAt(referenceLoc.getContig(), referenceLoc.getStart(), referenceLoc.getStop()).getBases(); - } - - return reference; + return getReference( referenceReader, 0 ); } - public GenomeLoc getLocation() { return loc; } - - public GenomeLoc getReferenceLocation() { return referenceLoc; } + public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + return referenceReader.getSubsequenceAt( fullExtentReferenceLoc.getContig(), + Math.max(1, fullExtentReferenceLoc.getStart() - padding), + Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases(); + } + @Override + public GenomeLoc getLocation() { return activeRegionLoc; } + public GenomeLoc getExtendedLoc() { return extendedLoc; } + public GenomeLoc getReferenceLoc() { return fullExtentReferenceLoc; } + + public int getExtension() { return extension; } public int size() { return reads.size(); } + public void clearReads() { reads.clear(); } + public void remove( final GATKSAMRecord read ) { reads.remove( read ); } + public void removeAll( final ArrayList readsToRemove ) { reads.removeAll( readsToRemove ); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index e5500ca21..68bf6dce8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -1,9 +1,15 @@ package org.broadinstitute.sting.utils.fragments; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import java.util.*; @@ -121,4 +127,63 @@ public class FragmentUtils { return create(reads, reads.size(), SamRecordGetter); } + public final static List mergeOverlappingPairedFragments( List overlappingPair ) { + final byte MIN_QUAL_BAD_OVERLAP = 16; + if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); } + + GATKSAMRecord firstRead = overlappingPair.get(0); + GATKSAMRecord secondRead = overlappingPair.get(1); + if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) { + firstRead = overlappingPair.get(1); + secondRead = overlappingPair.get(0); + } + if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) { + return overlappingPair; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A + } + if( firstRead.getCigarString().contains("I") || firstRead.getCigarString().contains("D") || secondRead.getCigarString().contains("I") || secondRead.getCigarString().contains("D") ) { + return overlappingPair; // fragments contain indels so don't merge them + } + + final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart()); + + final int firstReadStop = ( pair.getSecond() ? pair.getFirst() + 1 : pair.getFirst() ); + final int numBases = firstReadStop + secondRead.getReadLength(); + final byte[] bases = new byte[numBases]; + final byte[] quals = new byte[numBases]; + final byte[] firstReadBases = firstRead.getReadBases(); + final byte[] firstReadQuals = firstRead.getBaseQualities(); + final byte[] secondReadBases = secondRead.getReadBases(); + final byte[] secondReadQuals = secondRead.getBaseQualities(); + for(int iii = 0; iii < firstReadStop; iii++) { + bases[iii] = firstReadBases[iii]; + quals[iii] = firstReadQuals[iii]; + } + for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) { + if( firstReadQuals[iii] > MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] > MIN_QUAL_BAD_OVERLAP && firstReadBases[iii] != secondReadBases[iii-firstReadStop] ) { + return overlappingPair;// high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them + } + bases[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadBases[iii] : secondReadBases[iii-firstReadStop] ); + quals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadQuals[iii] : secondReadQuals[iii-firstReadStop] ); + } + for(int iii = firstRead.getReadLength(); iii < numBases; iii++) { + bases[iii] = secondReadBases[iii-firstReadStop]; + quals[iii] = secondReadQuals[iii-firstReadStop]; + } + + final GATKSAMRecord returnRead = new GATKSAMRecord(firstRead.getHeader()); + returnRead.setAlignmentStart(firstRead.getUnclippedStart()); + returnRead.setReadBases( bases ); + returnRead.setBaseQualities( quals ); + returnRead.setReadGroup( firstRead.getReadGroup() ); + returnRead.setReferenceName( firstRead.getReferenceName() ); + final CigarElement c = new CigarElement(bases.length, CigarOperator.M); + final ArrayList cList = new ArrayList(); + cList.add(c); + returnRead.setCigar( new Cigar( cList )); + returnRead.setMappingQuality( firstRead.getMappingQuality() ); + + final ArrayList returnList = new ArrayList(); + returnList.add(returnRead); + return returnList; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index cc0b1ae67..d1e3ce26b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -608,7 +608,7 @@ public class ReadUtils { * Example: Locus => {read1, read2, ..., readN} * * - Second: readToLocusMap -- a HashMap that describes for each read what loci it contributes to the coverage. - * Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with true meaning it contributes to the coverage. + * Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with value==true meaning it contributes to the coverage. * Example: Read => {true, true, false, ... false} * * @param readList the list of reads to generate the association mappings diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index c9a4965c1..39045ea21 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -464,7 +464,11 @@ public class VariantContextUtils { /** * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. */ - KEEP_IF_ALL_UNFILTERED + KEEP_IF_ALL_UNFILTERED, + /** + * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset. + */ + KEEP_UNCONDITIONAL } /** @@ -635,7 +639,7 @@ public class VariantContextUtils { } // if at least one record was unfiltered and we want a union, clear all of the filters - if ( filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() ) + if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL ) filters.clear(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java index 1a01ef8e8..9aae1f0ae 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java @@ -41,7 +41,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " + "--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " + "--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " + - "-o %s -NO_HEADER", 1, Arrays.asList("b445d280fd8fee1eeb4aacb3f5a54847")); + "-o %s -NO_HEADER", 1, Arrays.asList("6d0f213918e3b9ea33bc2f8a51a462f1")); executeTest("test BeagleOutputToVCF", spec); } @@ -72,7 +72,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+ "--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+ "--beaglePhased:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+ - "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("51a57ea565176edd96d907906914b0ee")); + "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("ddbf490f1d9f37cc79fe414c8d40886f")); executeTest("testBeagleChangesSitesToRef",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 32fa8679e..5cdf12f1b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -44,7 +44,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("43e7a17d95b1a0cf72e669657794d802")); + Arrays.asList("1899bdb956c62bbcbf160b18cd3aea60")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -275,7 +275,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("36ce53ae4319718ad9c8ae391deebc8c")); + Arrays.asList("320f61c87253aba77d6dc782242cba8b")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala index 3935c2138..3c7cd0a2d 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala @@ -26,23 +26,23 @@ class VCFExtractIntervals(inVCF: File, outList: File, useFilterSites: Boolean) e var cur : String = null if ( elems.hasNext ) { cur = elems.next - } else { - out.printf("%s%n",prev) - } - while ( elems.hasNext ) { - out.printf("%s%n",prev) - while ( cur.equals(prev) && elems.hasNext && !cur.equals("") ) { - cur = elems.next - } - - if ( ! cur.equals(prev) ) { - if ( elems.hasNext ) { - prev = cur + while ( elems.hasNext ) { + out.printf("%s%n",prev) + while ( cur.equals(prev) && elems.hasNext && !cur.equals("") ) { cur = elems.next - } else { - out.printf("%s%n",cur) + } + + if ( ! cur.equals(prev) ) { + if ( elems.hasNext ) { + prev = cur + cur = elems.next + } } } + out.printf("%s%n",prev) + out.printf("%s%n",cur) + } else { + out.printf("%s%n",prev) } out.close diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala index 54e541142..3179c327f 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala @@ -6,7 +6,7 @@ import collection.JavaConversions._ import org.broadinstitute.sting.commandline._ import java.io.{PrintWriter, PrintStream, File} -class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extends InProcessFunction { +class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extends InProcessFunction { def this(in: File, out: File, samples: File) = this(in,out, (new XReadLines(samples)).readLines.toList) @Input(doc="VCF from which to extract samples") var inputVCF : File = inVCF