First version of walker that combines the functionality of IndelIntervalWalker, MismatchIntervalWalker, SNPClusterWalker, and IntervalMergerWalker - plus it allows the user to input rods containing known indels (e.g. dbSNP or 1KG calls) for automatic cleaning. Basically, all pre-processing steps for cleaning are now done in a single pass.
More testing needed. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2672 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d6b9b788a8
commit
78890c0bee
|
|
@ -0,0 +1,230 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.indels;
|
||||||
|
|
||||||
|
import net.sf.samtools.*;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.*;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
|
||||||
|
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.*;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 reads and reads with zero mapping quality.
|
||||||
|
*/
|
||||||
|
@ReadFilters({Platform454Filter.class, ZeroMappingQualityReadFilter.class})
|
||||||
|
public class RealignerTargetCreator extends LocusWalker<RealignerTargetCreator.Event, RealignerTargetCreator.Event> {
|
||||||
|
|
||||||
|
// mismatch/entropy arguments
|
||||||
|
@Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy or SNP clusters", required=false)
|
||||||
|
protected int windowSize = 10;
|
||||||
|
|
||||||
|
@Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1", required=false)
|
||||||
|
protected double mismatchThreshold = 0.15;
|
||||||
|
|
||||||
|
|
||||||
|
// observed indels arguments
|
||||||
|
@Argument(fullName="minIndelsPerInterval", shortName="minIndels", doc="min indels per interval", required=false)
|
||||||
|
int minIntervalIndelCount = 1;
|
||||||
|
|
||||||
|
|
||||||
|
// interval merging arguments
|
||||||
|
@Argument(fullName="maxIntervalSize", shortName="maxInterval", doc="max interval size", required=false)
|
||||||
|
int maxIntervalSize = 500;
|
||||||
|
|
||||||
|
|
||||||
|
private final int minReadsAtInterval = 4;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean generateExtendedEvents() { return true; }
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||||
|
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
if ( windowSize < 2 )
|
||||||
|
throw new StingException("Window Size must be an integer greater than 1");
|
||||||
|
}
|
||||||
|
|
||||||
|
public Event map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
|
||||||
|
boolean hasIndel = false;
|
||||||
|
boolean hasInsertion = false;
|
||||||
|
boolean hasPointEvent = false;
|
||||||
|
|
||||||
|
long furthestStopPos = -1;
|
||||||
|
|
||||||
|
// look for insertions in the extended context (we'll get deletions from the normal context)
|
||||||
|
if ( context.hasExtendedEventPileup() ) {
|
||||||
|
ReadBackedExtendedEventPileup pileup = context.getExtendedEventPileup();
|
||||||
|
if ( pileup.getNumberOfInsertions() > 0 ) {
|
||||||
|
hasIndel = hasInsertion = true;
|
||||||
|
// check the ends of the reads to see how far they extend
|
||||||
|
for (ExtendedEventPileupElement p : pileup )
|
||||||
|
furthestStopPos = Math.max(furthestStopPos, p.getRead().getAlignmentEnd());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// look at the rods for indels or SNPs
|
||||||
|
if ( tracker != null ) {
|
||||||
|
Iterator<ReferenceOrderedDatum> rods = tracker.getAllRods().iterator();
|
||||||
|
while ( rods.hasNext() ) {
|
||||||
|
ReferenceOrderedDatum rod = rods.next();
|
||||||
|
if ( rod instanceof VariationRod ) {
|
||||||
|
if ( ((VariationRod)rod).isIndel() ) {
|
||||||
|
hasIndel = true;
|
||||||
|
if ( ((VariationRod)rod).isInsertion() )
|
||||||
|
hasInsertion = true;
|
||||||
|
}
|
||||||
|
if ( ((VariationRod)rod).isSNP() )
|
||||||
|
hasPointEvent = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// look at the normal context to get deletions and positions with high entropy
|
||||||
|
ReadBackedPileup pileup = context.getBasePileup();
|
||||||
|
if ( pileup != null ) {
|
||||||
|
|
||||||
|
int mismatchQualities = 0, totalQualities = 0;
|
||||||
|
char upperRef = Character.toUpperCase(ref.getBase());
|
||||||
|
for (PileupElement p : pileup ) {
|
||||||
|
// check the ends of the reads to see how far they extend
|
||||||
|
SAMRecord read = p.getRead();
|
||||||
|
furthestStopPos = Math.max(furthestStopPos, read.getAlignmentEnd());
|
||||||
|
|
||||||
|
// is it a deletion? (sanity check in case extended event missed it)
|
||||||
|
if ( p.isDeletion() ) {
|
||||||
|
hasIndel = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// look for mismatches
|
||||||
|
else {
|
||||||
|
if ( Character.toUpperCase(p.getBase()) != upperRef )
|
||||||
|
mismatchQualities += p.getQual();
|
||||||
|
totalQualities += p.getQual();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// make sure we're supposed to look for high entropy
|
||||||
|
if ( mismatchThreshold > 0.0 &&
|
||||||
|
mismatchThreshold <= 1.0 &&
|
||||||
|
pileup.size() >= minReadsAtInterval &&
|
||||||
|
(double)mismatchQualities / (double)totalQualities >= mismatchThreshold )
|
||||||
|
hasPointEvent = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( !hasIndel && !hasPointEvent )
|
||||||
|
return null;
|
||||||
|
|
||||||
|
GenomeLoc eventLoc = context.getLocation();
|
||||||
|
if ( hasInsertion )
|
||||||
|
eventLoc = GenomeLocParser.createGenomeLoc(eventLoc.getContigIndex(), eventLoc.getStart(), eventLoc.getStart()+1);
|
||||||
|
|
||||||
|
EVENT_TYPE eventType = (hasIndel ? (hasPointEvent ? EVENT_TYPE.BOTH : EVENT_TYPE.INDEL_EVENT) : EVENT_TYPE.POINT_EVENT);
|
||||||
|
|
||||||
|
return new Event(eventLoc, furthestStopPos, eventType);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void onTraversalDone(Event sum) {
|
||||||
|
if ( sum != null && sum.isReportableEvent() )
|
||||||
|
out.println(sum.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
public Event reduceInit() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Event reduce(Event value, Event sum) {
|
||||||
|
// ignore no new events
|
||||||
|
if ( value == null )
|
||||||
|
return sum;
|
||||||
|
|
||||||
|
// if it's the first good value, use it
|
||||||
|
if ( sum == null )
|
||||||
|
return value;
|
||||||
|
|
||||||
|
// if we hit a new contig or they have no overlapping reads, then they are separate events - so clear sum
|
||||||
|
if ( sum.loc.getContigIndex() != value.loc.getContigIndex() || sum.furthestStopPos < value.loc.getStart() ) {
|
||||||
|
if ( sum.isReportableEvent() )
|
||||||
|
out.println(sum.toString());
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
// otherwise, merge the two events
|
||||||
|
sum.merge(value);
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Kind of evidence map() found at a locus. */
private enum EVENT_TYPE {
    /** only a point event (SNP rod or high mismatch fraction) */
    POINT_EVENT,
    /** only an indel (from the reads or from a rod) */
    INDEL_EVENT,
    /** both an indel and a point event */
    BOTH
}
|
||||||
|
|
||||||
|
class Event {
|
||||||
|
public long furthestStopPos;
|
||||||
|
|
||||||
|
public GenomeLoc loc;
|
||||||
|
public long eventStartPos;
|
||||||
|
private long eventStopPos;
|
||||||
|
private EVENT_TYPE type;
|
||||||
|
private ArrayList<Long> pointEvents = new ArrayList<Long>();
|
||||||
|
|
||||||
|
public Event(GenomeLoc loc, long furthestStopPos, EVENT_TYPE type) {
|
||||||
|
this.loc = loc;
|
||||||
|
this.furthestStopPos = furthestStopPos;
|
||||||
|
this.type = type;
|
||||||
|
|
||||||
|
if ( type == EVENT_TYPE.INDEL_EVENT || type == EVENT_TYPE.BOTH ) {
|
||||||
|
eventStartPos = loc.getStart();
|
||||||
|
eventStopPos = loc.getStop();
|
||||||
|
} else {
|
||||||
|
eventStartPos = -1;
|
||||||
|
eventStopPos = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( type == EVENT_TYPE.POINT_EVENT || type == EVENT_TYPE.BOTH ) {
|
||||||
|
pointEvents.add(loc.getStart());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void merge(Event e) {
|
||||||
|
|
||||||
|
// merges only get called for events with certain types
|
||||||
|
if ( e.type == EVENT_TYPE.INDEL_EVENT || e.type == EVENT_TYPE.BOTH ) {
|
||||||
|
if ( eventStartPos == -1 )
|
||||||
|
eventStartPos = e.eventStartPos;
|
||||||
|
eventStopPos = e.eventStopPos;
|
||||||
|
furthestStopPos = e.furthestStopPos;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( e.type == EVENT_TYPE.POINT_EVENT || e.type == EVENT_TYPE.BOTH ) {
|
||||||
|
long newPosition = e.pointEvents.get(0);
|
||||||
|
if ( pointEvents.size() > 0 ) {
|
||||||
|
long lastPosition = pointEvents.get(pointEvents.size()-1);
|
||||||
|
if ( newPosition - lastPosition < windowSize ) {
|
||||||
|
eventStopPos = Math.max(eventStopPos, newPosition);
|
||||||
|
furthestStopPos = e.furthestStopPos;
|
||||||
|
|
||||||
|
if ( eventStartPos == -1 )
|
||||||
|
eventStartPos = lastPosition;
|
||||||
|
else
|
||||||
|
eventStartPos = Math.min(eventStartPos, lastPosition);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pointEvents.add(newPosition);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isReportableEvent() {
|
||||||
|
return eventStartPos >= 0 && eventStopPos >= 0 && eventStopPos - eventStartPos < maxIntervalSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return String.format("%s:%d-%d", loc.getContig(), eventStartPos, eventStopPos);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue