First version of walker that combines the functionality of IndelIntervalWalker, MismatchIntervalWalker, SNPClusterWalker, and IntervalMergerWalker - plus it allows the user to input rods containing known indels (e.g. dbSNP or 1KG calls) for automatic cleaning. Basically, all pre-processing steps for cleaning are now done in a single pass.

More testing needed.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2672 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-01-24 05:32:38 +00:00
parent d6b9b788a8
commit 78890c0bee
1 changed files with 230 additions and 0 deletions

View File

@ -0,0 +1,230 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.*;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.*;
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
/**
* Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 reads.
*/
@ReadFilters({Platform454Filter.class, ZeroMappingQualityReadFilter.class})
public class RealignerTargetCreator extends LocusWalker<RealignerTargetCreator.Event, RealignerTargetCreator.Event> {
// mismatch/entropy arguments
@Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy or SNP clusters", required=false)
protected int windowSize = 10;
@Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1", required=false)
protected double mismatchThreshold = 0.15;
// observed indels arguments
@Argument(fullName="minIndelsPerInterval", shortName="minIndels", doc="min indels per interval", required=false)
int minIntervalIndelCount = 1;
// interval merging arguments
@Argument(fullName="maxIntervalSize", shortName="maxInterval", doc="max interval size", required=false)
int maxIntervalSize = 500;
private final int minReadsAtInterval = 4;
@Override
public boolean generateExtendedEvents() { return true; }
@Override
public boolean includeReadsWithDeletionAtLoci() { return true; }
public void initialize() {
if ( windowSize < 2 )
throw new StingException("Window Size must be an integer greater than 1");
}
public Event map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
boolean hasIndel = false;
boolean hasInsertion = false;
boolean hasPointEvent = false;
long furthestStopPos = -1;
// look for insertions in the extended context (we'll get deletions from the normal context)
if ( context.hasExtendedEventPileup() ) {
ReadBackedExtendedEventPileup pileup = context.getExtendedEventPileup();
if ( pileup.getNumberOfInsertions() > 0 ) {
hasIndel = hasInsertion = true;
// check the ends of the reads to see how far they extend
for (ExtendedEventPileupElement p : pileup )
furthestStopPos = Math.max(furthestStopPos, p.getRead().getAlignmentEnd());
}
}
// look at the rods for indels or SNPs
if ( tracker != null ) {
Iterator<ReferenceOrderedDatum> rods = tracker.getAllRods().iterator();
while ( rods.hasNext() ) {
ReferenceOrderedDatum rod = rods.next();
if ( rod instanceof VariationRod ) {
if ( ((VariationRod)rod).isIndel() ) {
hasIndel = true;
if ( ((VariationRod)rod).isInsertion() )
hasInsertion = true;
}
if ( ((VariationRod)rod).isSNP() )
hasPointEvent = true;
}
}
}
// look at the normal context to get deletions and positions with high entropy
ReadBackedPileup pileup = context.getBasePileup();
if ( pileup != null ) {
int mismatchQualities = 0, totalQualities = 0;
char upperRef = Character.toUpperCase(ref.getBase());
for (PileupElement p : pileup ) {
// check the ends of the reads to see how far they extend
SAMRecord read = p.getRead();
furthestStopPos = Math.max(furthestStopPos, read.getAlignmentEnd());
// is it a deletion? (sanity check in case extended event missed it)
if ( p.isDeletion() ) {
hasIndel = true;
}
// look for mismatches
else {
if ( Character.toUpperCase(p.getBase()) != upperRef )
mismatchQualities += p.getQual();
totalQualities += p.getQual();
}
}
// make sure we're supposed to look for high entropy
if ( mismatchThreshold > 0.0 &&
mismatchThreshold <= 1.0 &&
pileup.size() >= minReadsAtInterval &&
(double)mismatchQualities / (double)totalQualities >= mismatchThreshold )
hasPointEvent = true;
}
if ( !hasIndel && !hasPointEvent )
return null;
GenomeLoc eventLoc = context.getLocation();
if ( hasInsertion )
eventLoc = GenomeLocParser.createGenomeLoc(eventLoc.getContigIndex(), eventLoc.getStart(), eventLoc.getStart()+1);
EVENT_TYPE eventType = (hasIndel ? (hasPointEvent ? EVENT_TYPE.BOTH : EVENT_TYPE.INDEL_EVENT) : EVENT_TYPE.POINT_EVENT);
return new Event(eventLoc, furthestStopPos, eventType);
}
public void onTraversalDone(Event sum) {
if ( sum != null && sum.isReportableEvent() )
out.println(sum.toString());
}
public Event reduceInit() {
return null;
}
public Event reduce(Event value, Event sum) {
// ignore no new events
if ( value == null )
return sum;
// if it's the first good value, use it
if ( sum == null )
return value;
// if we hit a new contig or they have no overlapping reads, then they are separate events - so clear sum
if ( sum.loc.getContigIndex() != value.loc.getContigIndex() || sum.furthestStopPos < value.loc.getStart() ) {
if ( sum.isReportableEvent() )
out.println(sum.toString());
return value;
}
// otherwise, merge the two events
sum.merge(value);
return sum;
}
private enum EVENT_TYPE { POINT_EVENT, INDEL_EVENT, BOTH }
class Event {
public long furthestStopPos;
public GenomeLoc loc;
public long eventStartPos;
private long eventStopPos;
private EVENT_TYPE type;
private ArrayList<Long> pointEvents = new ArrayList<Long>();
public Event(GenomeLoc loc, long furthestStopPos, EVENT_TYPE type) {
this.loc = loc;
this.furthestStopPos = furthestStopPos;
this.type = type;
if ( type == EVENT_TYPE.INDEL_EVENT || type == EVENT_TYPE.BOTH ) {
eventStartPos = loc.getStart();
eventStopPos = loc.getStop();
} else {
eventStartPos = -1;
eventStopPos = -1;
}
if ( type == EVENT_TYPE.POINT_EVENT || type == EVENT_TYPE.BOTH ) {
pointEvents.add(loc.getStart());
}
}
public void merge(Event e) {
// merges only get called for events with certain types
if ( e.type == EVENT_TYPE.INDEL_EVENT || e.type == EVENT_TYPE.BOTH ) {
if ( eventStartPos == -1 )
eventStartPos = e.eventStartPos;
eventStopPos = e.eventStopPos;
furthestStopPos = e.furthestStopPos;
}
if ( e.type == EVENT_TYPE.POINT_EVENT || e.type == EVENT_TYPE.BOTH ) {
long newPosition = e.pointEvents.get(0);
if ( pointEvents.size() > 0 ) {
long lastPosition = pointEvents.get(pointEvents.size()-1);
if ( newPosition - lastPosition < windowSize ) {
eventStopPos = Math.max(eventStopPos, newPosition);
furthestStopPos = e.furthestStopPos;
if ( eventStartPos == -1 )
eventStartPos = lastPosition;
else
eventStartPos = Math.min(eventStartPos, lastPosition);
}
}
pointEvents.add(newPosition);
}
}
public boolean isReportableEvent() {
return eventStartPos >= 0 && eventStopPos >= 0 && eventStopPos - eventStartPos < maxIntervalSize;
}
public String toString() {
return String.format("%s:%d-%d", loc.getContig(), eventStartPos, eventStopPos);
}
}
}