Merge branch 'master' of ssh://chartl@tin.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Christopher Hartl 2012-01-20 12:44:22 -05:00
commit 3fe73f155c
10 changed files with 155 additions and 51 deletions

View File

@ -443,14 +443,22 @@ public class GenomeAnalysisEngine {
if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM)
throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available.");
if(walker instanceof LocusWalker || walker instanceof ActiveRegionWalker) {
if(walker instanceof LocusWalker) {
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
if(intervals == null)
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
else
return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
}
}
else if(walker instanceof ActiveRegionWalker) {
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
if(intervals == null)
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
else
return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer());
}
else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) {
// Apply special validation to read pair walkers.
if(walker instanceof ReadPairWalker) {

View File

@ -77,7 +77,7 @@ public class LinearMicroScheduler extends MicroScheduler {
done = walker.isDone();
}
// Special function call to empty out the work queue. Ugly for now but will be cleaned up when we push this functionality more into the engine
// Special function call to empty out the work queue. Ugly for now but will be cleaned up when we eventually push this functionality more into the engine
if( traversalEngine instanceof TraverseActiveRegions ) {
final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit());
accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator

View File

@ -7,10 +7,9 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -46,13 +45,15 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
T sum) {
logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider));
LocusView locusView = getLocusView( walker, dataProvider );
final LocusView locusView = getLocusView( walker, dataProvider );
final GenomeLocSortedSet initialIntervals = engine.getIntervals();
int minStart = Integer.MAX_VALUE;
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
int minStart = Integer.MAX_VALUE;
final ArrayList<ActiveRegion> isActiveList = new ArrayList<ActiveRegion>();
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
@ -90,9 +91,11 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
// Call the walkers isActive function for this locus and add them to the list to be integrated later
final boolean isActive = walker.isActive( tracker, refContext, locus );
isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser()) );
if( initialIntervals.overlaps(location) ) {
final boolean isActive = walker.isActive( tracker, refContext, locus );
isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser(), activeRegionExtension ) );
}
// Grab all the previously unseen reads from this pileup and add them to the massive read list
for( final PileupElement p : locus.getBasePileup() ) {
final SAMRecord read = p.getRead();
@ -101,11 +104,20 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
}
}
// If this is the last pileup for this shard then need to calculate the minimum alignment start so that
// we know which active regions in the work queue are now safe to process
// If this is the last pileup for this shard then need to first do a special walker.isActive() call
// and then calculate the minimum alignment start so that we know which active regions in the work queue are now safe to process
if( !locusView.hasNext() ) {
// Call the walkers isActive function for this locus and add them to the list to be integrated later
if( initialIntervals.overlaps(location) ) {
final boolean isActive = walker.isActive( tracker, refContext, locus );
isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser(), activeRegionExtension ) );
}
for( final PileupElement p : locus.getBasePileup() ) {
final SAMRecord read = p.getRead();
if( !myReads.contains(read) ) {
myReads.add(read);
}
if( read.getAlignmentStart() < minStart ) { minStart = read.getAlignmentStart(); }
}
}
@ -117,11 +129,14 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
final ArrayList<ActiveRegion> activeRegions = integrateActiveList( isActiveList );
logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." );
workQueue.addAll( activeRegions );
}
while( workQueue.peek().getLocation().getStop() < minStart ) {
final ActiveRegion activeRegion = workQueue.remove();
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
// Since we've moved sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
if( !workQueue.isEmpty() ) {
while( workQueue.peek().getExtendedLoc().getStop() < minStart || !workQueue.peek().getExtendedLoc().getContig().equals(dataProvider.getLocus().getContig()) ) {
final ActiveRegion activeRegion = workQueue.remove();
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
}
}
}
return sum;
@ -158,16 +173,18 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
activeRegion.add( (GATKSAMRecord) read, false );
}
for( final ActiveRegion otherRegionToTest : workQueue ) {
if( !bestRegion.equals(otherRegionToTest) && otherRegionToTest.getLocation().overlapsP( readLoc ) ) {
if( !bestRegion.equals(otherRegionToTest) && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) {
activeRegion.add( (GATKSAMRecord) read, false );
}
}
placedReads.add( read );
} else if( activeRegion.getExtendedLoc().overlapsP( readLoc ) ) {
activeRegion.add( (GATKSAMRecord) read, false );
}
}
reads.removeAll( placedReads ); // remove all the reads which have been placed into their active region
logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLocation());
logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
final M x = walker.map( activeRegion, null ); // BUGBUG: tracker needs to be filled in and passed to the walker
return walker.reduce( x, sum );
}
@ -178,8 +195,8 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
* @param dataProvider Data with which to drive the locus view.
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
*/
private LocusView getLocusView( Walker<M,T> walker, LocusShardDataProvider dataProvider ) {
DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
private LocusView getLocusView( final Walker<M,T> walker, final LocusShardDataProvider dataProvider ) {
final DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
if( dataSource == DataSource.READS )
return new CoveredLocusView(dataProvider);
else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers )
@ -193,21 +210,29 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
// integrate active regions into contiguous chunks based on active status
private ArrayList<ActiveRegion> integrateActiveList( final ArrayList<ActiveRegion> activeList ) {
final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
ActiveRegion prevLocus = activeList.remove(0);
ActiveRegion startLocus = prevLocus;
for( final ActiveRegion thisLocus : activeList ) {
if( prevLocus.isActive != thisLocus.isActive ) {
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()),
prevLocus.isActive, engine.getGenomeLocParser() ) );
startLocus = thisLocus;
if( activeList.size() == 0 ) {
return returnList;
} else if( activeList.size() == 1 ) {
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(activeList.get(0).getLocation().getContig(), activeList.get(0).getLocation().getStart(), activeList.get(0).getLocation().getStart()),
activeList.get(0).isActive, engine.getGenomeLocParser(), activeList.get(0).getExtension() ) );
return returnList;
} else {
ActiveRegion prevLocus = activeList.get(0);
ActiveRegion startLocus = prevLocus;
for( final ActiveRegion thisLocus : activeList ) {
if( prevLocus.isActive != thisLocus.isActive || !prevLocus.getLocation().contiguousP( thisLocus.getLocation() ) ) {
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()),
prevLocus.isActive, engine.getGenomeLocParser(), startLocus.getExtension() ) );
startLocus = thisLocus;
}
prevLocus = thisLocus;
}
prevLocus = thisLocus;
// output the last region if necessary
if( startLocus != prevLocus ) {
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()),
prevLocus.isActive, engine.getGenomeLocParser(), startLocus.getExtension() ) );
}
return returnList;
}
// output the last region if necessary
if( startLocus != prevLocus ) {
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()),
prevLocus.isActive, engine.getGenomeLocParser() ) );
}
return returnList;
}
}

View File

@ -0,0 +1,19 @@
package org.broadinstitute.sting.gatk.walkers;
import java.lang.annotation.Documented;
import java.lang.annotation.Inherited;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
/**
 * Describes the size of the buffer region that is added to each active region
 * when pulling in covered reads. The extension value is applied as an offset to
 * genomic start/stop coordinates (see ActiveRegionWalker.extendIntervals), so it
 * is measured in bases.
 *
 * User: rpoplin
 * Date: 1/18/12
 */
@Documented
@Inherited
@Retention(RetentionPolicy.RUNTIME)
public @interface ActiveRegionExtension {
    // NOTE: annotation members are implicitly public; the redundant modifier was removed.
    /** Number of bases by which to extend each active region; defaults to 0 (no extension). */
    int extension() default 0;
}

View File

@ -1,13 +1,26 @@
package org.broadinstitute.sting.gatk.walkers;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter;
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import java.util.ArrayList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* Base class for all the Active Region Walkers.
* User: rpoplin
* Date: 12/7/11
*/
@ -15,7 +28,10 @@ import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
@By(DataSource.READS)
@Requires({DataSource.READS, DataSource.REFERENCE_BASES})
@PartitionBy(PartitionType.READ)
@ActiveRegionExtension(extension=50)
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
// Do we actually want to operate on the context?
public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
return true; // We are keeping all the reads
@ -26,4 +42,15 @@ public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<Map
// Map over the ActiveRegion
public abstract MapType map(final ActiveRegion activeRegion, final ReadMetaDataTracker metaDataTracker);
public final GenomeLocSortedSet extendIntervals( final GenomeLocSortedSet intervals, final GenomeLocParser genomeLocParser, IndexedFastaSequenceFile reference ) {
final int activeRegionExtension = this.getClass().getAnnotation(ActiveRegionExtension.class).extension();
final List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>();
for( final GenomeLoc interval : intervals.toList() ) {
final int start = Math.max( 1, interval.getStart() - activeRegionExtension );
final int stop = Math.min( reference.getSequenceDictionary().getSequence(interval.getContig()).getSequenceLength(), interval.getStop() + activeRegionExtension );
allIntervals.add( genomeLocParser.createGenomeLoc(interval.getContig(), start, stop) );
}
return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, IntervalMergingRule.ALL);
}
}

View File

@ -73,7 +73,6 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
if ( pvalue == null )
return null;
// use Math.abs to prevent -0's
Map<String, Object> map = new HashMap<String, Object>();
map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue)));
return map;

View File

@ -467,6 +467,6 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
}
public long sizeOfOverlap( final GenomeLoc that ) {
return ( this.overlapsP(that) ? Math.min( getStop(), that.getStop() ) - Math.max( getStart(), that.getStart() ) : 0L );
return ( this.overlapsP(that) ? Math.min( getStop(), that.getStop() ) - Math.max( getStart(), that.getStart() ) + 1L : 0L );
}
}

View File

@ -127,6 +127,21 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
return mArray.isEmpty();
}
/**
 * Determine whether the given location overlaps any interval stored in this sorted set.
 *
 * Performs a linear scan over the backing array, stopping at the first overlap found.
 *
 * @param loc the location to test against the set
 * @return true if at least one interval in the set overlaps loc, false otherwise
 */
public boolean overlaps(final GenomeLoc loc) {
    boolean foundOverlap = false;
    for (final GenomeLoc interval : mArray) {
        if (interval.overlapsP(loc)) {
            foundOverlap = true;
            break;
        }
    }
    return foundOverlap;
}
/**
* add a genomeLoc to the collection, simply inserting in order into the set
*

View File

@ -18,21 +18,25 @@ public class ActiveRegion implements HasGenomeLocation {
private final ArrayList<ActiveRead> reads = new ArrayList<ActiveRead>();
private byte[] reference = null;
private final GenomeLoc loc;
private GenomeLoc referenceLoc = null;
private final GenomeLoc activeRegionLoc;
private final GenomeLoc extendedLoc;
private final int extension;
private GenomeLoc fullExtentReferenceLoc = null;
private final GenomeLocParser genomeLocParser;
public final boolean isActive;
public ActiveRegion( final GenomeLoc loc, final boolean isActive, final GenomeLocParser genomeLocParser ) {
this.loc = loc;
public ActiveRegion( final GenomeLoc activeRegionLoc, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) {
this.activeRegionLoc = activeRegionLoc;
this.isActive = isActive;
this.genomeLocParser = genomeLocParser;
referenceLoc = loc;
this.extension = extension;
extendedLoc = genomeLocParser.createGenomeLoc(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension);
fullExtentReferenceLoc = extendedLoc;
}
// add each read to the bin and extend the reference genome loc if needed
// add each read to the bin and extend the reference genome activeRegionLoc if needed
public void add( final GATKSAMRecord read, final boolean isPrimaryRegion ) {
referenceLoc = referenceLoc.union( genomeLocParser.createGenomeLoc( read ) );
fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) );
reads.add( new ActiveRead(read, isPrimaryRegion) );
}
@ -41,15 +45,18 @@ public class ActiveRegion implements HasGenomeLocation {
public byte[] getReference( final IndexedFastaSequenceFile referenceReader ) {
// set up the reference if we haven't done so yet
if ( reference == null ) {
reference = referenceReader.getSubsequenceAt(referenceLoc.getContig(), referenceLoc.getStart(), referenceLoc.getStop()).getBases();
reference = referenceReader.getSubsequenceAt(fullExtentReferenceLoc.getContig(), fullExtentReferenceLoc.getStart(), fullExtentReferenceLoc.getStop()).getBases();
}
return reference;
}
public GenomeLoc getLocation() { return loc; }
public GenomeLoc getReferenceLocation() { return referenceLoc; }
@Override
public GenomeLoc getLocation() { return activeRegionLoc; }
public GenomeLoc getExtendedLoc() { return extendedLoc; }
public GenomeLoc getReferenceLoc() { return fullExtentReferenceLoc; }
public int getExtension() { return extension; }
public int size() { return reads.size(); }
}

View File

@ -464,7 +464,11 @@ public class VariantContextUtils {
/**
* Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this.
*/
KEEP_IF_ALL_UNFILTERED
KEEP_IF_ALL_UNFILTERED,
/**
* If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset.
*/
KEEP_UNCONDITIONAL
}
/**
@ -635,7 +639,7 @@ public class VariantContextUtils {
}
// if at least one record was unfiltered and we want a union, clear all of the filters
if ( filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() )
if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL )
filters.clear();