We need to sort cleaned reads ourselves (instead of letting SAMFileWriter

do it) because the SAM headers are often screwed up and claim to be
"unsorted".  While here, I broke off the module from the SortSamIterator
in case someone else wants to use it.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@654 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-05-11 15:43:42 +00:00
parent c735e1f627
commit 009e71fcd9
3 changed files with 54 additions and 47 deletions

View File

@ -1,21 +1,13 @@
package org.broadinstitute.sting.gatk.iterators;
import org.broadinstitute.sting.utils.ComparableSAMRecord;
import net.sf.samtools.SAMRecord;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
/**
* Created by IntelliJ IDEA.
* User: mdepristo
* Date: Mar 15, 2009
* Time: 6:02:31 PM
* To change this template use File | Settings | File Templates.
*/
public class SortSamIterator implements Iterator<SAMRecord> {
Iterator<ComparableSAMRecord> it;
@ -25,9 +17,9 @@ public class SortSamIterator implements Iterator<SAMRecord> {
ArrayList<ComparableSAMRecord> list = new ArrayList<ComparableSAMRecord>();
while (unsortedIter.hasNext()) {
list.add(new ComparableSAMRecord(unsortedIter.next()));
// choose an arbitrary length to limit sorting for now
// limit how much can be sorted for now
if (list.size() > maxSorts)
throw new UnsupportedOperationException("Can not sort files with more than 100K reads on the fly!");
throw new UnsupportedOperationException("Can not sort files with more than " + maxSorts + " reads on the fly!");
}
Collections.sort(list);
it = list.iterator();
@ -39,23 +31,4 @@ public class SortSamIterator implements Iterator<SAMRecord> {
public void remove() {
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
}
private class ComparableSAMRecord implements Comparable<ComparableSAMRecord> {
private SAMRecord record;
public ComparableSAMRecord(SAMRecord record) {
this.record = record;
}
public SAMRecord getRecord() {
return record;
}
public int compareTo(ComparableSAMRecord o) {
GenomeLoc myLoc = new GenomeLoc(record);
GenomeLoc hisLoc = new GenomeLoc(o.getRecord());
return myLoc.compareTo(hisLoc);
}
}
}

View File

@ -2,6 +2,7 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.ComparableSAMRecord;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.LocusWindowWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
@ -11,8 +12,7 @@ import org.broadinstitute.sting.playground.indels.*;
import net.sf.samtools.*;
import java.util.ArrayList;
import java.util.List;
import java.util.*;
import java.io.File;
@WalkerName("IntervalCleaner")
@ -27,10 +27,13 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
public static final int MAX_QUAL = 99;
private SAMFileWriter writer;
// we need to sort the reads ourselves because SAM headers get messed up and claim to be "unsorted" sometimes
private TreeSet<ComparableSAMRecord> readsToWrite;
public void initialize() {
SAMFileHeader header = getToolkit().getSamReader().getFileHeader();
writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, new File(OUT));
readsToWrite = new TreeSet<ComparableSAMRecord>();
}
// do we care about reads that are not part of our intervals?
@ -54,7 +57,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START )
goodReads.add(read);
else
writer.addAlignment(read);
readsToWrite.add(new ComparableSAMRecord(read));
}
clean(goodReads, ref, context.getLocation().getStart());
@ -62,6 +65,9 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
//testCleanWithDeletion();
//testCleanWithInsertion();
Iterator<ComparableSAMRecord> iter = readsToWrite.iterator();
while ( iter.hasNext() )
writer.addAlignment(iter.next().getRecord());
return 1;
}
@ -198,13 +204,13 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
// clean the appropriate reads
for ( Pair<Integer, Integer> indexPair : bestConsensus.readIndexes )
updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.getSecond(), altReads.get(indexPair.getFirst()), (int)leftmostIndex);
// write them out
for ( SAMRecord rec : refReads )
writer.addAlignment(rec);
for ( AlignedRead aRec : altReads )
writer.addAlignment(aRec.getRead());
}
// write them out
for ( SAMRecord rec : refReads )
readsToWrite.add(new ComparableSAMRecord(rec));
for ( AlignedRead aRec : altReads )
readsToWrite.add(new ComparableSAMRecord(aRec.getRead()));
}
private Pair<Integer, Integer> findBestOffset(String ref, AlignedRead read) {
@ -540,18 +546,18 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
}
// if the best alternate consensus has a smaller sum of quality score mismatches, then clean!
if ( bestConsensus.mismatchSum < totalMismatchSum ) {
if ( bestConsensus != null && bestConsensus.mismatchSum < totalMismatchSum ) {
logger.info("CLEAN: " + bestConsensus.str);
// clean the appropriate reads
for ( Pair<Integer, Integer> indexPair : bestConsensus.readIndexes )
updateRead(bestConsensus.cigar, bestConsensus.positionOnReference, indexPair.getSecond(), altReads.get(indexPair.getFirst()), (int)leftmostIndex);
// write them out
for ( SAMRecord rec : refReads )
writer.addAlignment(rec);
for ( AlignedRead aRec : altReads )
writer.addAlignment(aRec.getRead());
}
// write them out
for ( SAMRecord rec : refReads )
readsToWrite.add(new ComparableSAMRecord(rec));
for ( AlignedRead aRec : altReads )
readsToWrite.add(new ComparableSAMRecord(aRec.getRead()));
}
}

View File

@ -0,0 +1,28 @@
package org.broadinstitute.sting.utils;
import net.sf.samtools.SAMRecord;
public class ComparableSAMRecord implements Comparable<ComparableSAMRecord> {
private SAMRecord record;
public ComparableSAMRecord(SAMRecord record) {
this.record = record;
}
public SAMRecord getRecord() {
return record;
}
public int compareTo(ComparableSAMRecord o) {
// first sort by start position
GenomeLoc myLoc = new GenomeLoc(record);
GenomeLoc hisLoc = new GenomeLoc(o.getRecord());
int comparison = myLoc.compareTo(hisLoc);
// if the reads have the same start position, we must give a non-zero comparison
// (because java Sets often require "consistency with equals")
if ( comparison == 0 )
comparison = record.getReadName().compareTo(o.getRecord().getReadName());
return comparison;
}
}