Start getting the cleaner working in Walker

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@561 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-04-29 14:59:53 +00:00
parent 4c5f640eb7
commit 7de5da7065
3 changed files with 75 additions and 23 deletions

View File

@ -99,28 +99,41 @@ public class TraverseByLocusWindows extends TraversalEngine {
ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
ArrayList<Integer> offsets = new ArrayList<Integer>();
boolean done = false;
long leftmostIndex = interval.getStart(),
rightmostIndex = interval.getStop();
while (filterIter.hasNext() && !done) {
TraversalStatistics.nRecords++;
SAMRecord read = filterIter.next();
reads.add(read);
offsets.add((int)(read.getAlignmentStart() - interval.getStart()));
if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) {
if ( read.getAlignmentStart() < leftmostIndex )
leftmostIndex = read.getAlignmentStart();
if ( read.getAlignmentEnd() > rightmostIndex )
rightmostIndex = read.getAlignmentEnd();
if ( this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads ) {
logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords));
done = true;
}
}
LocusContext locus = new LocusContext(interval, reads, offsets);
GenomeLoc window = new GenomeLoc(interval.getContig(), leftmostIndex, rightmostIndex);
LocusContext locus = new LocusContext(window, reads, offsets);
if ( DOWNSAMPLE_BY_COVERAGE )
locus.downsampleToCoverage(downsamplingCoverage);
ReferenceIterator refSite = refIter.seekForward(locus.getLocation());
ReferenceIterator refSite = refIter.seekForward(window);
StringBuffer refBases = new StringBuffer(refSite.getBaseAsString());
int locusLength = (int)(rightmostIndex - leftmostIndex);
for ( int i = 0; i < locusLength; i++ ) {
refSite = refSite.next();
refBases.append(refSite.getBaseAsChar());
}
locus.setReferenceContig(refSite.getCurrentContig());
// Iterate forward to get all reference ordered data covering this interval
final RefMetaDataTracker tracker = getReferenceOrderedDataAtLocus(locus.getLocation());
sum = walkAtinterval( walker, sum, locus, refSite, tracker );
sum = walkAtinterval( walker, sum, locus, refBases.toString(), tracker );
//System.out.format("Working at %s\n", locus.getLocation().toString());
@ -132,24 +145,17 @@ public class TraverseByLocusWindows extends TraversalEngine {
protected <M, T> T walkAtinterval( final LocusWindowWalker<M, T> walker,
T sum,
final LocusContext locus,
final ReferenceIterator refSite,
final String refSeq,
final RefMetaDataTracker tracker ) {
ReferenceIterator refSiteCopy = refSite;
StringBuffer refBases = new StringBuffer(refSiteCopy.getBaseAsString());
int locusLength = (int)(locus.getLocation().getStop() - locus.getLocation().getStart());
for ( int i = 0; i < locusLength; i++ ) {
refSiteCopy = refSiteCopy.next();
refBases.append(refSiteCopy.getBaseAsChar());
}
//logger.debug(String.format(" Reference: %s:%d %c", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase));
//
// Execute our contract with the walker. Call filter, map, and reduce
//
final boolean keepMeP = walker.filter(tracker, refBases.toString(), locus);
final boolean keepMeP = walker.filter(tracker, refSeq, locus);
if (keepMeP) {
M x = walker.map(tracker, refBases.toString(), locus);
M x = walker.map(tracker, refSeq, locus);
sum = walker.reduce(x, sum);
}

View File

@ -6,18 +6,41 @@ import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.playground.indels.*;
import org.broadinstitute.sting.playground.utils.CountedObject;
import org.broadinstitute.sting.playground.utils.CountedObjectComparatorAdapter;
import net.sf.samtools.*;
import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;
import java.io.File;
@WalkerName("IntervalCleaner")
public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer> {
@Argument(fullName="maxReadLength", shortName="maxRead", required=false, defaultValue="-1")
public int maxReadLength;
@Argument(fullName="OutputCleaned", shortName="O", required=true, doc="Output file (sam or bam) for improved (realigned) reads")
public String OUT;
public void initialize() {}
private SAMFileWriter writer;
public void initialize() {
SAMFileHeader header = getToolkit().getSamReader().getFileHeader();
writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, new File(OUT));
}
public Integer map(RefMetaDataTracker tracker, String ref, LocusContext context) {
out.println(context.getLocation() + " (" + context.getReads().size() + ") => " + ref);
List<SAMRecord> reads = context.getReads();
ArrayList<SAMRecord> goodReads = new ArrayList<SAMRecord>();
long leftmostIndex = context.getLocation().getStart();
for ( SAMRecord read : reads ) {
if ( read.getReadLength() <= maxReadLength )
goodReads.add(read);
}
clean(goodReads, ref, context.getLocation().getStart());
return 1;
}
@ -32,4 +55,21 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer> {
public void onTraversalDone(Integer result) {
out.println("Saw " + result + " intervals");
}
private void clean(List<SAMRecord> reads, String reference, long leftmostIndex) {
// total mismatches across all reads
//int totalMismatches = 0;
//TreeSet< CountedObject<Indel> > all_indels = new TreeSet< CountedObject<Indel> >(
// new CountedObjectComparatorAdapter<Indel>(new IntervalComparator()));
for ( SAMRecord read : reads ) {
System.out.println(read.getReadString());
System.out.println(reference.substring(read.getAlignmentStart()-(int)leftmostIndex, read.getAlignmentEnd()-(int)leftmostIndex+1));
//totalMismatches += AlignmentUtils.numMismatches(read, reference);
//System.out.println(totalMismatches + "\n");
}
}
}

View File

@ -130,6 +130,19 @@ public class PileBuilder implements RecordPileReceiver {
public int getNumReadsWritten() { return total_reads_written; }
public void receive(Collection<SAMRecord> c) {
int startOnRef = 1000000000; // absolute start (leftmost) position of the pile of reads on the ref
int stopOnRef = 0; // absolute stop (rightmost) position of the pile of reads on the ref (rightmost alignment end)
for ( SAMRecord r : c ) {
startOnRef = Math.min(startOnRef, r.getAlignmentStart() );
stopOnRef = Math.max(stopOnRef,r.getAlignmentEnd());
}
// part of the reference covered by original alignments:
String pileRef = referenceSequence.substring(startOnRef-1,stopOnRef);
receive(c, pileRef, startOnRef);
}
public void receive(Collection<SAMRecord> c, String pileRef, int startOnRef) {
//TODO: if read starts/ends with an indel (insertion, actually), we detect this as a "different" indel introduced during cleanup.
processed_piles++;
@ -137,17 +150,10 @@ public class PileBuilder implements RecordPileReceiver {
IndexedSequence[] seqs = new IndexedSequence[c.size()];
int i = 0;
int startOnRef = 1000000000; // absolute start (leftmost) position of the pile of reads on the ref
int stopOnRef = 0; // absolute stop (rightmost) position of the pile of reads on the ref (rightmost alignment end)
for ( SAMRecord r : c ) {
seqs[i++] = new IndexedSequence(r.getReadString(),KmerSize);
startOnRef = Math.min(startOnRef, r.getAlignmentStart() );
stopOnRef = Math.max(stopOnRef,r.getAlignmentEnd());
}
// part of the reference covered by original alignments:
String pileRef = referenceSequence.substring(startOnRef-1,stopOnRef);
int totalMismatches = 0; // total mismatches across all reads
TreeSet< CountedObject<Indel> > all_indels = new TreeSet< CountedObject<Indel> >(
new CountedObjectComparatorAdapter<Indel>(new IntervalComparator()));