From 78890c0beecf0e911246ede573d6a3c0c9e691b1 Mon Sep 17 00:00:00 2001 From: ebanks Date: Sun, 24 Jan 2010 05:32:38 +0000 Subject: [PATCH] First version of walker that combines the functionality of IndelIntervalWalker, MismatchIntervalWalker, SNPClusterWalker, and IntervalMergerWalker - plus it allows the user to input rods containing known indels (e.g. dbSNP or 1KG calls) for automatic cleaning. Basically, all pre-processing steps for cleaning are now done in a single pass. More testing needed. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2672 348d0f76-0448-11de-a6fe-93d51630548a --- .../indels/RealignerTargetCreator.java | 230 ++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java new file mode 100755 index 000000000..df630a16c --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -0,0 +1,230 @@ +package org.broadinstitute.sting.gatk.walkers.indels; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.Platform454Filter; +import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.pileup.*; +import org.broadinstitute.sting.gatk.walkers.ReadFilters; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import java.util.*; + +/** + * Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 reads. + */ +@ReadFilters({Platform454Filter.class, ZeroMappingQualityReadFilter.class}) +public class RealignerTargetCreator extends LocusWalker { + + // mismatch/entropy arguments + @Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy or SNP clusters", required=false) + protected int windowSize = 10; + + @Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1", required=false) + protected double mismatchThreshold = 0.15; + + + // observed indels arguments + @Argument(fullName="minIndelsPerInterval", shortName="minIndels", doc="min indels per interval", required=false) + int minIntervalIndelCount = 1; + + + // interval merging arguments + @Argument(fullName="maxIntervalSize", shortName="maxInterval", doc="max interval size", required=false) + int maxIntervalSize = 500; + + + private final int minReadsAtInterval = 4; + + @Override + public boolean generateExtendedEvents() { return true; } + + @Override + public boolean includeReadsWithDeletionAtLoci() { return true; } + + + public void initialize() { + if ( windowSize < 2 ) + throw new StingException("Window Size must be an integer greater than 1"); + } + + public Event map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + boolean hasIndel = false; + boolean hasInsertion = false; + boolean hasPointEvent = false; + + long furthestStopPos = -1; + + // look for insertions in the extended context (we'll get deletions from the normal context) + if ( context.hasExtendedEventPileup() ) { + ReadBackedExtendedEventPileup pileup = context.getExtendedEventPileup(); + if ( pileup.getNumberOfInsertions() > 0 ) { + hasIndel = hasInsertion = true; + // check the ends of the reads to see how far they extend + for (ExtendedEventPileupElement p : pileup ) + furthestStopPos = Math.max(furthestStopPos, p.getRead().getAlignmentEnd()); + } + } + + // look at the rods for indels or SNPs + if ( tracker != null ) { + Iterator rods = tracker.getAllRods().iterator(); + while ( rods.hasNext() ) { + ReferenceOrderedDatum rod = rods.next(); + if ( rod instanceof VariationRod ) { + if ( ((VariationRod)rod).isIndel() ) { + hasIndel = true; + if ( ((VariationRod)rod).isInsertion() ) + hasInsertion = true; + } + if ( ((VariationRod)rod).isSNP() ) + hasPointEvent = true; + } + } + } + + // look at the normal context to get deletions and positions with high entropy + ReadBackedPileup pileup = context.getBasePileup(); + if ( pileup != null ) { + + int mismatchQualities = 0, totalQualities = 0; + char upperRef = Character.toUpperCase(ref.getBase()); + for (PileupElement p : pileup ) { + // check the ends of the reads to see how far they extend + SAMRecord read = p.getRead(); + furthestStopPos = Math.max(furthestStopPos, read.getAlignmentEnd()); + + // is it a deletion? (sanity check in case extended event missed it) + if ( p.isDeletion() ) { + hasIndel = true; + } + + // look for mismatches + else { + if ( Character.toUpperCase(p.getBase()) != upperRef ) + mismatchQualities += p.getQual(); + totalQualities += p.getQual(); + } + } + + // make sure we're supposed to look for high entropy + if ( mismatchThreshold > 0.0 && + mismatchThreshold <= 1.0 && + pileup.size() >= minReadsAtInterval && + (double)mismatchQualities / (double)totalQualities >= mismatchThreshold ) + hasPointEvent = true; + } + + if ( !hasIndel && !hasPointEvent ) + return null; + + GenomeLoc eventLoc = context.getLocation(); + if ( hasInsertion ) + eventLoc = GenomeLocParser.createGenomeLoc(eventLoc.getContigIndex(), eventLoc.getStart(), eventLoc.getStart()+1); + + EVENT_TYPE eventType = (hasIndel ? (hasPointEvent ? EVENT_TYPE.BOTH : EVENT_TYPE.INDEL_EVENT) : EVENT_TYPE.POINT_EVENT); + + return new Event(eventLoc, furthestStopPos, eventType); + } + + public void onTraversalDone(Event sum) { + if ( sum != null && sum.isReportableEvent() ) + out.println(sum.toString()); + } + + public Event reduceInit() { + return null; + } + + public Event reduce(Event value, Event sum) { + // ignore no new events + if ( value == null ) + return sum; + + // if it's the first good value, use it + if ( sum == null ) + return value; + + // if we hit a new contig or they have no overlapping reads, then they are separate events - so clear sum + if ( sum.loc.getContigIndex() != value.loc.getContigIndex() || sum.furthestStopPos < value.loc.getStart() ) { + if ( sum.isReportableEvent() ) + out.println(sum.toString()); + return value; + } + + // otherwise, merge the two events + sum.merge(value); + return sum; + } + + private enum EVENT_TYPE { POINT_EVENT, INDEL_EVENT, BOTH } + + class Event { + public long furthestStopPos; + + public GenomeLoc loc; + public long eventStartPos; + private long eventStopPos; + private EVENT_TYPE type; + private ArrayList pointEvents = new ArrayList(); + + public Event(GenomeLoc loc, long furthestStopPos, EVENT_TYPE type) { + this.loc = loc; + this.furthestStopPos = furthestStopPos; + this.type = type; + + if ( type == EVENT_TYPE.INDEL_EVENT || type == EVENT_TYPE.BOTH ) { + eventStartPos = loc.getStart(); + eventStopPos = loc.getStop(); + } else { + eventStartPos = -1; + eventStopPos = -1; + } + + if ( type == EVENT_TYPE.POINT_EVENT || type == EVENT_TYPE.BOTH ) { + pointEvents.add(loc.getStart()); + } + } + + public void merge(Event e) { + + // merges only get called for events with certain types + if ( e.type == EVENT_TYPE.INDEL_EVENT || e.type == EVENT_TYPE.BOTH ) { + if ( eventStartPos == -1 ) + eventStartPos = e.eventStartPos; + eventStopPos = e.eventStopPos; + furthestStopPos = e.furthestStopPos; + } + + if ( e.type == EVENT_TYPE.POINT_EVENT || e.type == EVENT_TYPE.BOTH ) { + long newPosition = e.pointEvents.get(0); + if ( pointEvents.size() > 0 ) { + long lastPosition = pointEvents.get(pointEvents.size()-1); + if ( newPosition - lastPosition < windowSize ) { + eventStopPos = Math.max(eventStopPos, newPosition); + furthestStopPos = e.furthestStopPos; + + if ( eventStartPos == -1 ) + eventStartPos = lastPosition; + else + eventStartPos = Math.min(eventStartPos, lastPosition); + } + } + pointEvents.add(newPosition); + } + } + + public boolean isReportableEvent() { + return eventStartPos >= 0 && eventStopPos >= 0 && eventStopPos - eventStartPos < maxIntervalSize; + } + + public String toString() { + return String.format("%s:%d-%d", loc.getContig(), eventStartPos, eventStopPos); + } + } +} \ No newline at end of file