From 009e71fcd97351f5168a238a8ccb5e93d00a2b74 Mon Sep 17 00:00:00 2001 From: ebanks Date: Mon, 11 May 2009 15:43:42 +0000 Subject: [PATCH] We need to sort cleaned reads ourselves (instead of letting SAMFileWriter do it) because the SAM headers are often screwed up and claim to be "unsorted". While here, I broke off the module from the SortSamIterator in case someone else wants to use it. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@654 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/iterators/SortSamIterator.java | 35 ++--------------- .../gatk/walkers/IntervalCleanerWalker.java | 38 +++++++++++-------- .../sting/utils/ComparableSAMRecord.java | 28 ++++++++++++++ 3 files changed, 54 insertions(+), 47 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/utils/ComparableSAMRecord.java diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java index 68d46ba5a..575ce77af 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java @@ -1,21 +1,13 @@ package org.broadinstitute.sting.gatk.iterators; +import org.broadinstitute.sting.utils.ComparableSAMRecord; + import net.sf.samtools.SAMRecord; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Utils; - -/** - * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Mar 15, 2009 - * Time: 6:02:31 PM - * To change this template use File | Settings | File Templates. - */ public class SortSamIterator implements Iterator { Iterator it; @@ -25,9 +17,9 @@ public class SortSamIterator implements Iterator { ArrayList list = new ArrayList(); while (unsortedIter.hasNext()) { list.add(new ComparableSAMRecord(unsortedIter.next())); - // choose an arbitrary length to limit sorting for now + // limit how much can be sorted for now if (list.size() > maxSorts) - throw new UnsupportedOperationException("Can not sort files with more than 100K reads on the fly!"); + throw new UnsupportedOperationException("Can not sort files with more than " + maxSorts + " reads on the fly!"); } Collections.sort(list); it = list.iterator(); @@ -39,23 +31,4 @@ public class SortSamIterator implements Iterator { public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - - private class ComparableSAMRecord implements Comparable { - - private SAMRecord record; - - public ComparableSAMRecord(SAMRecord record) { - this.record = record; - } - - public SAMRecord getRecord() { - return record; - } - - public int compareTo(ComparableSAMRecord o) { - GenomeLoc myLoc = new GenomeLoc(record); - GenomeLoc hisLoc = new GenomeLoc(o.getRecord()); - return myLoc.compareTo(hisLoc); - } - } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java index 9e5e696ae..86e80bfbc 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/IntervalCleanerWalker.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.playground.gatk.walkers; import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.ComparableSAMRecord; import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker; import org.broadinstitute.sting.gatk.walkers.WalkerName; @@ -11,8 +12,7 @@ import org.broadinstitute.sting.playground.indels.*; import net.sf.samtools.*; -import java.util.ArrayList; -import java.util.List; +import java.util.*; import java.io.File; @WalkerName("IntervalCleaner") @@ -27,10 +27,13 @@ public class IntervalCleanerWalker extends LocusWindowWalker public static final int MAX_QUAL = 99; private SAMFileWriter writer; + // we need to sort the reads ourselves because SAM headers get messed up and claim to be "unsorted" sometimes + private TreeSet readsToWrite; public void initialize() { SAMFileHeader header = getToolkit().getSamReader().getFileHeader(); writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, new File(OUT)); + readsToWrite = new TreeSet(); } // do we care about reads that are not part of our intervals? @@ -54,7 +57,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) goodReads.add(read); else - writer.addAlignment(read); + readsToWrite.add(new ComparableSAMRecord(read)); } clean(goodReads, ref, context.getLocation().getStart()); @@ -62,6 +65,9 @@ public class IntervalCleanerWalker extends LocusWindowWalker //testCleanWithDeletion(); //testCleanWithInsertion(); + Iterator iter = readsToWrite.iterator(); + while ( iter.hasNext() ) + writer.addAlignment(iter.next().getRecord()); return 1; } @@ -198,13 +204,13 @@ public class IntervalCleanerWalker extends LocusWindowWalker // clean the appropriate reads for ( Pair indexPair : bestConsensus.readIndexes ) updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.getSecond(), altReads.get(indexPair.getFirst()), (int)leftmostIndex); - - // write them out - for ( SAMRecord rec : refReads ) - writer.addAlignment(rec); - for ( AlignedRead aRec : altReads ) - writer.addAlignment(aRec.getRead()); } + + // write them out + for ( SAMRecord rec : refReads ) + readsToWrite.add(new ComparableSAMRecord(rec)); + for ( AlignedRead aRec : altReads ) + readsToWrite.add(new ComparableSAMRecord(aRec.getRead())); } private Pair findBestOffset(String ref, AlignedRead read) { @@ -540,18 +546,18 @@ public class IntervalCleanerWalker extends LocusWindowWalker } // if the best alternate consensus has a smaller sum of quality score mismatches, then clean! - if ( bestConsensus.mismatchSum < totalMismatchSum ) { + if ( bestConsensus != null && bestConsensus.mismatchSum < totalMismatchSum ) { logger.info("CLEAN: " + bestConsensus.str); // clean the appropriate reads for ( Pair indexPair : bestConsensus.readIndexes ) updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.getSecond(), altReads.get(indexPair.getFirst()), (int)leftmostIndex); - - // write them out - for ( SAMRecord rec : refReads ) - writer.addAlignment(rec); - for ( AlignedRead aRec : altReads ) - writer.addAlignment(aRec.getRead()); } + + // write them out + for ( SAMRecord rec : refReads ) + readsToWrite.add(new ComparableSAMRecord(rec)); + for ( AlignedRead aRec : altReads ) + readsToWrite.add(new ComparableSAMRecord(aRec.getRead())); } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/ComparableSAMRecord.java b/java/src/org/broadinstitute/sting/utils/ComparableSAMRecord.java new file mode 100755 index 000000000..9e9716b5e --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/ComparableSAMRecord.java @@ -0,0 +1,28 @@ +package org.broadinstitute.sting.utils; + +import net.sf.samtools.SAMRecord; + +public class ComparableSAMRecord implements Comparable { + + private SAMRecord record; + + public ComparableSAMRecord(SAMRecord record) { + this.record = record; + } + + public SAMRecord getRecord() { + return record; + } + + public int compareTo(ComparableSAMRecord o) { + // first sort by start position + GenomeLoc myLoc = new GenomeLoc(record); + GenomeLoc hisLoc = new GenomeLoc(o.getRecord()); + int comparison = myLoc.compareTo(hisLoc); + // if the reads have the same start position, we must give a non-zero comparison + // (because java Sets often require "consistency with equals") + if ( comparison == 0 ) + comparison = record.getReadName().compareTo(o.getRecord().getReadName()); + return comparison; + } +} \ No newline at end of file