From 7de5da7065db7e3bc90db1b5f2c1bc4f8cffb406 Mon Sep 17 00:00:00 2001 From: ebanks Date: Wed, 29 Apr 2009 14:59:53 +0000 Subject: [PATCH] Start getting the cleaner working in Walker git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@561 348d0f76-0448-11de-a6fe-93d51630548a --- .../traversals/TraverseByLocusWindows.java | 34 ++++++++------ .../gatk/walkers/IntervalCleanerWalker.java | 44 ++++++++++++++++++- .../sting/playground/indels/PileBuilder.java | 20 ++++++--- 3 files changed, 75 insertions(+), 23 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java index b38fe24f4..cd87d17c4 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java @@ -99,28 +99,41 @@ public class TraverseByLocusWindows extends TraversalEngine { ArrayList reads = new ArrayList(); ArrayList offsets = new ArrayList(); boolean done = false; + long leftmostIndex = interval.getStart(), + rightmostIndex = interval.getStop(); while (filterIter.hasNext() && !done) { TraversalStatistics.nRecords++; SAMRecord read = filterIter.next(); reads.add(read); offsets.add((int)(read.getAlignmentStart() - interval.getStart())); - if (this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads) { + if ( read.getAlignmentStart() < leftmostIndex ) + leftmostIndex = read.getAlignmentStart(); + if ( read.getAlignmentEnd() > rightmostIndex ) + rightmostIndex = read.getAlignmentEnd(); + if ( this.maxReads > 0 && TraversalStatistics.nRecords > this.maxReads ) { logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords)); done = true; } } - LocusContext locus = new LocusContext(interval, reads, offsets); + GenomeLoc window = new GenomeLoc(interval.getContig(), leftmostIndex, rightmostIndex); + LocusContext locus = new LocusContext(window, reads, offsets); if ( DOWNSAMPLE_BY_COVERAGE ) locus.downsampleToCoverage(downsamplingCoverage); - ReferenceIterator refSite = refIter.seekForward(locus.getLocation()); + ReferenceIterator refSite = refIter.seekForward(window); + StringBuffer refBases = new StringBuffer(refSite.getBaseAsString()); + int locusLength = (int)(rightmostIndex - leftmostIndex); + for ( int i = 0; i < locusLength; i++ ) { + refSite = refSite.next(); + refBases.append(refSite.getBaseAsChar()); + } locus.setReferenceContig(refSite.getCurrentContig()); // Iterate forward to get all reference ordered data covering this interval final RefMetaDataTracker tracker = getReferenceOrderedDataAtLocus(locus.getLocation()); - sum = walkAtinterval( walker, sum, locus, refSite, tracker ); + sum = walkAtinterval( walker, sum, locus, refBases.toString(), tracker ); //System.out.format("Working at %s\n", locus.getLocation().toString()); @@ -132,24 +145,17 @@ public class TraverseByLocusWindows extends TraversalEngine { protected T walkAtinterval( final LocusWindowWalker walker, T sum, final LocusContext locus, - final ReferenceIterator refSite, + final String refSeq, final RefMetaDataTracker tracker ) { - ReferenceIterator refSiteCopy = refSite; - StringBuffer refBases = new StringBuffer(refSiteCopy.getBaseAsString()); - int locusLength = (int)(locus.getLocation().getStop() - locus.getLocation().getStart()); - for ( int i = 0; i < locusLength; i++ ) { - refSiteCopy = refSiteCopy.next(); - refBases.append(refSiteCopy.getBaseAsChar()); - } //logger.debug(String.format(" Reference: %s:%d %c", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase)); // // Execute our contract with the walker. Call filter, map, and reduce // - final boolean keepMeP = walker.filter(tracker, refBases.toString(), locus); + final boolean keepMeP = walker.filter(tracker, refSeq, locus); if (keepMeP) { - M x = walker.map(tracker, refBases.toString(), locus); + M x = walker.map(tracker, refSeq, locus); sum = walker.reduce(x, sum); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java index 04a41ede7..07f3d7851 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java @@ -6,18 +6,41 @@ import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker; import org.broadinstitute.sting.gatk.walkers.WalkerName; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.playground.indels.*; +import org.broadinstitute.sting.playground.utils.CountedObject; +import org.broadinstitute.sting.playground.utils.CountedObjectComparatorAdapter; import net.sf.samtools.*; +import java.util.ArrayList; +import java.util.List; +import java.util.TreeSet; +import java.io.File; + @WalkerName("IntervalCleaner") public class IntervalCleanerWalker extends LocusWindowWalker { @Argument(fullName="maxReadLength", shortName="maxRead", required=false, defaultValue="-1") public int maxReadLength; + @Argument(fullName="OutputCleaned", shortName="O", required=true, doc="Output file (sam or bam) for improved (realigned) reads") + public String OUT; - public void initialize() {} + private SAMFileWriter writer; + + public void initialize() { + SAMFileHeader header = getToolkit().getSamReader().getFileHeader(); + writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, new File(OUT)); + } public Integer map(RefMetaDataTracker tracker, String ref, LocusContext context) { - out.println(context.getLocation() + " (" + context.getReads().size() + ") => " + ref); + List reads = context.getReads(); + ArrayList goodReads = new ArrayList(); + long leftmostIndex = context.getLocation().getStart(); + for ( SAMRecord read : reads ) { + if ( read.getReadLength() <= maxReadLength ) + goodReads.add(read); + } + clean(goodReads, ref, context.getLocation().getStart()); + return 1; } @@ -32,4 +55,21 @@ public class IntervalCleanerWalker extends LocusWindowWalker { public void onTraversalDone(Integer result) { out.println("Saw " + result + " intervals"); } + + private void clean(List reads, String reference, long leftmostIndex) { + // total mismatches across all reads + //int totalMismatches = 0; + //TreeSet< CountedObject > all_indels = new TreeSet< CountedObject >( + // new CountedObjectComparatorAdapter(new IntervalComparator())); + + for ( SAMRecord read : reads ) { + System.out.println(read.getReadString()); + System.out.println(reference.substring(read.getAlignmentStart()-(int)leftmostIndex, read.getAlignmentEnd()-(int)leftmostIndex+1)); + //totalMismatches += AlignmentUtils.numMismatches(read, reference); + //System.out.println(totalMismatches + "\n"); + } + + + + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java b/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java index 0044dab98..0d55bdbe4 100755 --- a/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java +++ b/java/src/org/broadinstitute/sting/playground/indels/PileBuilder.java @@ -130,6 +130,19 @@ public class PileBuilder implements RecordPileReceiver { public int getNumReadsWritten() { return total_reads_written; } public void receive(Collection c) { + int startOnRef = 1000000000; // absolute start (leftmost) position of the pile of reads on the ref + int stopOnRef = 0; // absolute stop (rightmost) position of the pile of reads on the ref (rightmost alignment end) + for ( SAMRecord r : c ) { + startOnRef = Math.min(startOnRef, r.getAlignmentStart() ); + stopOnRef = Math.max(stopOnRef,r.getAlignmentEnd()); + } + + // part of the reference covered by original alignments: + String pileRef = referenceSequence.substring(startOnRef-1,stopOnRef); + receive(c, pileRef, startOnRef); + } + + public void receive(Collection c, String pileRef, int startOnRef) { //TODO: if read starts/ends with an indel (insertion, actually), we detect this as a "different" indel introduced during cleanup. processed_piles++; @@ -137,17 +150,10 @@ public class PileBuilder implements RecordPileReceiver { IndexedSequence[] seqs = new IndexedSequence[c.size()]; int i = 0; - int startOnRef = 1000000000; // absolute start (leftmost) position of the pile of reads on the ref - int stopOnRef = 0; // absolute stop (rightmost) position of the pile of reads on the ref (rightmost alignment end) for ( SAMRecord r : c ) { seqs[i++] = new IndexedSequence(r.getReadString(),KmerSize); - startOnRef = Math.min(startOnRef, r.getAlignmentStart() ); - stopOnRef = Math.max(stopOnRef,r.getAlignmentEnd()); } - // part of the reference covered by original alignments: - String pileRef = referenceSequence.substring(startOnRef-1,stopOnRef); - int totalMismatches = 0; // total mismatches across all reads TreeSet< CountedObject > all_indels = new TreeSet< CountedObject >( new CountedObjectComparatorAdapter(new IntervalComparator()));