1. Merge LocusWindows whose reads overlap.
2. Fix bug (we weren't clearing the "to emit" list) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@670 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9f942fdfa0
commit
630066cc0a
|
|
@ -9,15 +9,13 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.iterators.ReferenceIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Iterator;
|
||||
import java.util.ArrayList;
|
||||
import java.util.*;
|
||||
import java.io.File;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import edu.mit.broad.picard.filter.FilteringIterator;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -85,21 +83,33 @@ public class TraverseByLocusWindows extends TraversalEngine {
|
|||
}
|
||||
|
||||
protected <M, T> T strictIntervalTraversal(LocusWindowWalker<M, T> walker, List<GenomeLoc> locations, T sum) {
|
||||
LocusContext nextLocusToCarry = null;
|
||||
for ( GenomeLoc interval : locations ) {
|
||||
logger.debug(String.format("Processing interval %s", interval.toString()));
|
||||
|
||||
CloseableIterator<SAMRecord> readIter = samReader.queryOverlapping( interval.getContig(),
|
||||
(int)interval.getStart(),
|
||||
(int)interval.getStop());
|
||||
|
||||
Iterator<SAMRecord> wrappedIter = WrapReadsIterator( readIter, false );
|
||||
sum = carryWalkerOverInterval(walker, wrappedIter, sum, interval);
|
||||
Iterator<SAMRecord> wrappedIter = WrapReadsIterator(readIter, false);
|
||||
LocusContext locus = getLocusContext(wrappedIter, interval);
|
||||
readIter.close();
|
||||
|
||||
if ( nextLocusToCarry == null ) {
|
||||
nextLocusToCarry = locus;
|
||||
} else if ( nextLocusToCarry.getLocation().overlapsP(locus.getLocation()) ) {
|
||||
nextLocusToCarry = merge(nextLocusToCarry, locus);
|
||||
} else {
|
||||
sum = carryWalkerOverInterval(walker, sum, nextLocusToCarry);
|
||||
nextLocusToCarry = locus;
|
||||
}
|
||||
}
|
||||
if ( nextLocusToCarry != null )
|
||||
sum = carryWalkerOverInterval(walker, sum, nextLocusToCarry);
|
||||
return sum;
|
||||
}
|
||||
|
||||
protected <M, T> T fullInputTraversal(LocusWindowWalker<M, T> walker, List<GenomeLoc> locations, T sum) {
|
||||
ArrayList<LocusContext> nextLociToCarry = new ArrayList<LocusContext>();
|
||||
|
||||
// set everything up
|
||||
GenomeLoc currentInterval = (locations.size() > 0 ? locations.get(0) : null);
|
||||
|
|
@ -113,6 +123,12 @@ public class TraverseByLocusWindows extends TraversalEngine {
|
|||
|
||||
// if there are no locations or we're past the last one or it's unmapped, then act on the read separately
|
||||
if ( currentInterval == null || read.getReadUnmappedFlag() ) {
|
||||
if ( nextLociToCarry.size() > 0 ) {
|
||||
sum = carryWalkerOverInterval(walker, sum, nextLociToCarry.get(0));
|
||||
for (int i=1; i < nextLociToCarry.size(); i++)
|
||||
walker.nonIntervalReadAction(nextLociToCarry.get(i).getReads().get(0));
|
||||
nextLociToCarry.clear();
|
||||
}
|
||||
walker.nonIntervalReadAction(read);
|
||||
}
|
||||
else {
|
||||
|
|
@ -123,36 +139,82 @@ public class TraverseByLocusWindows extends TraversalEngine {
|
|||
}
|
||||
// if we're not yet in the interval, act on the read separately
|
||||
else if ( currentInterval.isPast(loc) ) {
|
||||
walker.nonIntervalReadAction(read);
|
||||
if ( nextLociToCarry.size() == 0 ) {
|
||||
walker.nonIntervalReadAction(read);
|
||||
} else {
|
||||
ArrayList<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||
list.add(read);
|
||||
nextLociToCarry.add(new LocusContext(loc, list, null));
|
||||
}
|
||||
}
|
||||
// otherwise, we're past the interval so first deal with the collected reads and then this one
|
||||
else {
|
||||
if ( intervalReads.size() > 0 ) {
|
||||
Iterator<SAMRecord> wrappedIter = WrapReadsIterator(intervalReads.iterator(), false);
|
||||
sum = carryWalkerOverInterval(walker, wrappedIter, sum, currentInterval);
|
||||
LocusContext locus = getLocusContext(wrappedIter, currentInterval);
|
||||
|
||||
if ( nextLociToCarry.size() == 0 ) {
|
||||
nextLociToCarry.add(locus);
|
||||
} else if ( nextLociToCarry.get(0).getLocation().overlapsP(locus.getLocation()) ) {
|
||||
LocusContext newLocus = merge(nextLociToCarry.get(0), locus);
|
||||
for (int i=1; i < nextLociToCarry.size(); i++)
|
||||
newLocus = merge(newLocus, nextLociToCarry.get(i));
|
||||
nextLociToCarry.clear();
|
||||
nextLociToCarry.add(newLocus);
|
||||
} else {
|
||||
sum = carryWalkerOverInterval(walker, sum, nextLociToCarry.get(0));
|
||||
for (int i=1; i < nextLociToCarry.size(); i++)
|
||||
walker.nonIntervalReadAction(nextLociToCarry.get(i).getReads().get(0));
|
||||
nextLociToCarry.clear();
|
||||
nextLociToCarry.add(locus);
|
||||
}
|
||||
|
||||
// then prepare for the next interval
|
||||
intervalReads.clear();
|
||||
currentInterval = (++locationsIndex < locations.size() ? locations.get(locationsIndex) : null);
|
||||
}
|
||||
walker.nonIntervalReadAction(read);
|
||||
currentInterval = (++locationsIndex < locations.size() ? locations.get(locationsIndex) : null);
|
||||
|
||||
if ( nextLociToCarry.size() == 0 ) {
|
||||
walker.nonIntervalReadAction(read);
|
||||
} else {
|
||||
ArrayList<SAMRecord> list = new ArrayList<SAMRecord>();
|
||||
list.add(read);
|
||||
nextLociToCarry.add(new LocusContext(loc, list, null));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// some cleanup
|
||||
if ( intervalReads.size() > 0 ) {
|
||||
Iterator<SAMRecord> wrappedIter = WrapReadsIterator(intervalReads.iterator(), false);
|
||||
sum = carryWalkerOverInterval(walker, wrappedIter, sum, currentInterval);
|
||||
LocusContext locus = getLocusContext(wrappedIter, currentInterval);
|
||||
if ( nextLociToCarry.size() == 0 ) {
|
||||
nextLociToCarry.add(locus);
|
||||
} else if ( nextLociToCarry.get(0).getLocation().overlapsP(locus.getLocation()) ) {
|
||||
LocusContext newLocus = merge(nextLociToCarry.get(0), locus);
|
||||
for (int i=1; i < nextLociToCarry.size(); i++)
|
||||
newLocus = merge(newLocus, nextLociToCarry.get(i));
|
||||
nextLociToCarry.clear();
|
||||
nextLociToCarry.add(newLocus);
|
||||
} else {
|
||||
sum = carryWalkerOverInterval(walker, sum, nextLociToCarry.get(0));
|
||||
for (int i=1; i < nextLociToCarry.size(); i++)
|
||||
walker.nonIntervalReadAction(nextLociToCarry.get(i).getReads().get(0));
|
||||
nextLociToCarry.clear();
|
||||
nextLociToCarry.add(locus);
|
||||
}
|
||||
}
|
||||
if ( nextLociToCarry.size() > 0 ) {
|
||||
sum = carryWalkerOverInterval(walker, sum, nextLociToCarry.get(0));
|
||||
for (int i=1; i < nextLociToCarry.size(); i++)
|
||||
walker.nonIntervalReadAction(nextLociToCarry.get(i).getReads().get(0));
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
protected <M, T> T carryWalkerOverInterval(LocusWindowWalker<M, T> walker, Iterator<SAMRecord> readIter, T sum, GenomeLoc interval ) {
|
||||
logger.debug(String.format("TraverseByIntervals.carryWalkerOverInterval Genomic interval is %s", interval));
|
||||
|
||||
private LocusContext getLocusContext(Iterator<SAMRecord> readIter, GenomeLoc interval) {
|
||||
ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
|
||||
ArrayList<Integer> offsets = new ArrayList<Integer>();
|
||||
boolean done = false;
|
||||
long leftmostIndex = interval.getStart(),
|
||||
rightmostIndex = interval.getStop();
|
||||
|
|
@ -160,7 +222,6 @@ public class TraverseByLocusWindows extends TraversalEngine {
|
|||
TraversalStatistics.nRecords++;
|
||||
SAMRecord read = readIter.next();
|
||||
reads.add(read);
|
||||
offsets.add((int)(read.getAlignmentStart() - interval.getStart()));
|
||||
if ( read.getAlignmentStart() < leftmostIndex )
|
||||
leftmostIndex = read.getAlignmentStart();
|
||||
if ( read.getAlignmentEnd() > rightmostIndex )
|
||||
|
|
@ -172,27 +233,49 @@ public class TraverseByLocusWindows extends TraversalEngine {
|
|||
}
|
||||
|
||||
GenomeLoc window = new GenomeLoc(interval.getContig(), leftmostIndex, rightmostIndex);
|
||||
LocusContext locus = new LocusContext(window, reads, offsets);
|
||||
LocusContext locus = new LocusContext(window, reads, null);
|
||||
if ( DOWNSAMPLE_BY_COVERAGE )
|
||||
locus.downsampleToCoverage(downsamplingCoverage);
|
||||
|
||||
ReferenceIterator refSite = refIter.seekForward(window);
|
||||
return locus;
|
||||
}
|
||||
|
||||
private LocusContext merge(LocusContext locus1, LocusContext locus2) {
|
||||
GenomeLoc loc = locus1.getLocation().merge(locus2.getLocation());
|
||||
TreeSet<SAMRecord> set = new TreeSet<SAMRecord>(new Comparator<SAMRecord>() {
|
||||
public int compare(SAMRecord obj1, SAMRecord obj2) {
|
||||
GenomeLoc myLoc = new GenomeLoc(obj1);
|
||||
GenomeLoc hisLoc = new GenomeLoc(obj2);
|
||||
int comparison = myLoc.compareTo(hisLoc);
|
||||
// if the reads have the same start position, we must give a non-zero comparison
|
||||
// (because java Sets often require "consistency with equals")
|
||||
if ( comparison == 0 )
|
||||
comparison = obj1.getReadName().compareTo(obj2.getReadName());
|
||||
return comparison;
|
||||
}
|
||||
});
|
||||
|
||||
set.addAll(locus1.getReads());
|
||||
set.addAll(locus2.getReads());
|
||||
return new LocusContext(loc, new ArrayList<SAMRecord>(set), null);
|
||||
}
|
||||
|
||||
protected <M, T> T carryWalkerOverInterval(LocusWindowWalker<M, T> walker, T sum, LocusContext window) {
|
||||
ReferenceIterator refSite = refIter.seekForward(window.getLocation());
|
||||
StringBuffer refBases = new StringBuffer(refSite.getBaseAsString());
|
||||
int locusLength = (int)(rightmostIndex - leftmostIndex);
|
||||
int locusLength = (int)(window.getLocation().getStop() - window.getLocation().getStart());
|
||||
for ( int i = 0; i < locusLength; i++ ) {
|
||||
refSite = refSite.next();
|
||||
refBases.append(refSite.getBaseAsChar());
|
||||
}
|
||||
locus.setReferenceContig(refSite.getCurrentContig());
|
||||
window.setReferenceContig(refSite.getCurrentContig());
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this interval
|
||||
final RefMetaDataTracker tracker = getReferenceOrderedDataAtLocus(locus.getLocation());
|
||||
final RefMetaDataTracker tracker = getReferenceOrderedDataAtLocus(window.getLocation());
|
||||
|
||||
sum = walkAtinterval( walker, sum, locus, refBases.toString(), tracker );
|
||||
sum = walkAtinterval( walker, sum, window, refBases.toString(), tracker );
|
||||
|
||||
//System.out.format("Working at %s\n", locus.getLocation().toString());
|
||||
|
||||
printProgress("intervals", locus.getLocation());
|
||||
printProgress("intervals", window.getLocation());
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
|
@ -203,8 +286,6 @@ public class TraverseByLocusWindows extends TraversalEngine {
|
|||
final String refSeq,
|
||||
final RefMetaDataTracker tracker ) {
|
||||
|
||||
//logger.debug(String.format(" Reference: %s:%d %c", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase));
|
||||
|
||||
//
|
||||
// Execute our contract with the walker. Call filter, map, and reduce
|
||||
//
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ public class IndelIntervalWalker extends ReadWalker<IndelIntervalWalker.Interval
|
|||
long indelRightEdge = read.getAlignmentEnd() - blocks.get(blocks.size()-1).getLength() + 1;
|
||||
|
||||
GenomeLoc indelLoc = new GenomeLoc(read.getReferenceIndex(), indelLeftEdge, indelRightEdge);
|
||||
GenomeLoc refLoc = new GenomeLoc(read.getReferenceIndex(), read.getAlignmentStart(), read.getAlignmentEnd());
|
||||
GenomeLoc refLoc = new GenomeLoc(read);
|
||||
|
||||
return new Interval(refLoc, indelLoc);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -59,7 +59,6 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
|||
else
|
||||
readsToWrite.add(new ComparableSAMRecord(read));
|
||||
}
|
||||
|
||||
clean(goodReads, ref, context.getLocation().getStart());
|
||||
//bruteForceClean(goodReads, ref, context.getLocation().getStart());
|
||||
//testCleanWithDeletion();
|
||||
|
|
@ -68,6 +67,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
|||
Iterator<ComparableSAMRecord> iter = readsToWrite.iterator();
|
||||
while ( iter.hasNext() )
|
||||
writer.addAlignment(iter.next().getRecord());
|
||||
readsToWrite.clear();
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
@ -133,7 +133,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
|||
AlignedRead aRead = altReads.get(index);
|
||||
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadString());
|
||||
int refIdx = swConsensus.getAlignmentStart2wrt1();
|
||||
if ( refIdx < 0 || refIdx > reference.length() - aRead.getReadString().length() )
|
||||
if ( refIdx < 0 )
|
||||
continue;
|
||||
|
||||
// create the new consensus
|
||||
|
|
@ -143,6 +143,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
|||
|
||||
int indelCount = 0;
|
||||
int altIdx = 0;
|
||||
boolean ok_flag = true;
|
||||
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
|
||||
CigarElement ce = c.getCigarElement(i);
|
||||
switch( ce.getOperator() ) {
|
||||
|
|
@ -151,7 +152,10 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
|||
refIdx += ce.getLength();
|
||||
break;
|
||||
case M:
|
||||
sb.append(reference.substring(refIdx, refIdx+ce.getLength()));
|
||||
if ( reference.length() < refIdx+ce.getLength() )
|
||||
ok_flag = false;
|
||||
else
|
||||
sb.append(reference.substring(refIdx, refIdx+ce.getLength()));
|
||||
refIdx += ce.getLength();
|
||||
altIdx += ce.getLength();
|
||||
break;
|
||||
|
|
@ -162,8 +166,8 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
|
|||
break;
|
||||
}
|
||||
}
|
||||
// make sure that there is at most only a single indel!
|
||||
if ( indelCount > 1 )
|
||||
// make sure that there is at most only a single indel and it aligns appropriately!
|
||||
if ( !ok_flag || indelCount > 1 || reference.length() < refIdx )
|
||||
continue;
|
||||
|
||||
sb.append(reference.substring(refIdx));
|
||||
|
|
|
|||
Loading…
Reference in New Issue