From 29f3ad72f3fcc1856c243d06d350da12d7aba510 Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 4 Feb 2011 22:27:05 +0000 Subject: [PATCH] SAMFileWriter that allows the user to move reads, but only a bit, in an incoming coordinated sorted BAM files. Does some local reordering and local mate fixing, under specified constrained. These constrains allow us to make a special -- under testing for Eric, who promised to try this out a bit, expand test cases and integration tests -- but soon to be the default and only model of the realigner that only moves reads with ISIZE < 3000 that directly emits a coordinate sorted, mate fixed validating BAM file without needing FixMates externally. Preliminary testing shows this runs in a totally fine amount of memory and produces equivalent results to the previous version. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5199 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/io/StingSAMFileWriter.java | 3 + .../gatk/io/storage/SAMFileWriterStorage.java | 7 +- .../gatk/io/stubs/SAMFileWriterStub.java | 21 ++ .../gatk/walkers/indels/IndelRealigner.java | 58 +++- .../tools/CompareBAMAlignments.java | 96 ++++++ .../ConstrainedMateFixingSAMFileWriter.java | 293 ++++++++++++++++++ .../sting/utils/sam/GATKSAMRecord.java | 15 +- ...rainedMateFixingSAMFileWriterUnitTest.java | 141 +++++++++ 8 files changed, 614 insertions(+), 20 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/oneoffprojects/tools/CompareBAMAlignments.java create mode 100644 java/src/org/broadinstitute/sting/utils/sam/ConstrainedMateFixingSAMFileWriter.java create mode 100644 java/test/org/broadinstitute/sting/utils/sam/ConstrainedMateFixingSAMFileWriterUnitTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java b/java/src/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java index 8701ecf3c..f472c962c 100644 --- a/java/src/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java +++ b/java/src/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java @@ -28,4 +28,7 @@ public interface StingSAMFileWriter extends SAMFileWriter { * @param maxRecordsInRam Max number of records in RAM. */ public void setMaxRecordsInRam(int maxRecordsInRam); + + public void setMaxInsertSizeForMovingReadPairs(int maxInsertSizeForMovingReadPairs); + public void setUseConstrainedFileWriter(boolean useConstrainedFileWriter); } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java b/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java index 65ba31b5c..fa98288c2 100644 --- a/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java +++ b/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java @@ -33,6 +33,7 @@ import java.io.*; import net.sf.samtools.util.RuntimeIOException; import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterStub; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.ConstrainedMateFixingSAMFileWriter; /** * Provides temporary storage for SAMFileWriters. @@ -42,7 +43,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; */ public class SAMFileWriterStorage implements SAMFileWriter, Storage { private final File file; - private final SAMFileWriter writer; + private SAMFileWriter writer; public SAMFileWriterStorage( SAMFileWriterStub stub ) { this(stub,stub.getSAMFile()); @@ -79,6 +80,10 @@ public class SAMFileWriterStorage implements SAMFileWriter, Storage, StingSAMFileWrite outputTracker.getStorage(this).close(); } + // + // Experimental arguments for the constrained SAMFileWriter + // + private boolean useConstrainedFileWriter = false; + private int maxInsertSizeForMovingReadPairs = -1; + + public int getMaxInsertSizeForMovingReadPairs() { + return maxInsertSizeForMovingReadPairs; + } + + public void setMaxInsertSizeForMovingReadPairs(int maxInsertSizeForMovingReadPairs) { + this.maxInsertSizeForMovingReadPairs = maxInsertSizeForMovingReadPairs; + } + + public boolean useConstrainedFileWriter() { + return useConstrainedFileWriter; + } + + public void setUseConstrainedFileWriter(boolean useConstrainedFileWriter) { + this.useConstrainedFileWriter = useConstrainedFileWriter; + } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index b864cccee..ef0035d38 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -51,6 +51,7 @@ import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; import org.broadinstitute.sting.utils.interval.NwayIntervalMergingIterator; +import org.broadinstitute.sting.utils.sam.ConstrainedMateFixingSAMFileWriter; import org.broadinstitute.sting.utils.text.TextFormattingUtils; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.sam.AlignmentUtils; @@ -172,6 +173,16 @@ public class IndelRealigner extends ReadWalker { @Output(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false) protected String OUT_SNPS = null; + // + // Experimental output constraints + // + // TODO -- eric promised me he'll validate this further and release to the world as the only option to do cleaning + // + @Hidden + @Argument(fullName="constrainMovement", shortName="CM", required=false, doc="If provided, we'll try the experimental constraining output system") + protected boolean CONSTRAIN_MOVEMENT = false; + protected int MAX_ISIZE_FOR_MOVEMENT = 3000; + // fasta reference reader to supplement the edges of the reference sequence private IndexedFastaSequenceFile referenceReader; @@ -354,7 +365,7 @@ public class IndelRealigner extends ReadWalker { private SAMFileHeader setupHeader(SAMFileHeader header) { if ( DO_NOT_SORT ) header.setSortOrder(SAMFileHeader.SortOrder.unsorted); - else if ( SORT_IN_COORDINATE_ORDER ) + else if ( SORT_IN_COORDINATE_ORDER || CONSTRAIN_MOVEMENT ) header.setSortOrder(SAMFileHeader.SortOrder.coordinate); else header.setSortOrder(SAMFileHeader.SortOrder.queryname); @@ -384,8 +395,13 @@ public class IndelRealigner extends ReadWalker { } writer.writeHeader(header); - writer.setPresorted(false); + writer.setPresorted(CONSTRAIN_MOVEMENT); writer.setMaxRecordsInRam(MAX_RECORDS_IN_RAM); + + if ( CONSTRAIN_MOVEMENT ) { + writer.setUseConstrainedFileWriter(true); + writer.setMaxInsertSizeForMovingReadPairs(MAX_ISIZE_FOR_MOVEMENT); + } } private void emit(final SAMRecord read) { @@ -407,11 +423,19 @@ public class IndelRealigner extends ReadWalker { } } - private void emit(final List reads) { + private void emit(final Collection reads) { for ( SAMRecord read : reads ) emit(read); } + private void emitReadLists() { + // pre-merge lists with priority queue for constrained SAMFileWriter + readsNotToClean.addAll(readsToClean.getReads()); + emit(readsNotToClean); + readsToClean.clear(); + readsNotToClean.clear(); + } + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { if ( currentInterval == null ) { emit(read); @@ -459,21 +483,25 @@ public class IndelRealigner extends ReadWalker { } private void abortCleanForCurrentInterval() { - emit(readsNotToClean); - emit(readsToClean.getReads()); - readsToClean.clear(); - readsNotToClean.clear(); + emitReadLists(); currentInterval = intervals.hasNext() ? intervals.next() : null; } private boolean doNotTryToClean(SAMRecord read) { - return read.getReadUnmappedFlag() || + boolean immobileReadForWriting = CONSTRAIN_MOVEMENT && ConstrainedMateFixingSAMFileWriter.iSizeTooBigToMove(read, MAX_ISIZE_FOR_MOVEMENT); + + boolean old = read.getReadUnmappedFlag() || read.getNotPrimaryAlignmentFlag() || read.getReadFailsVendorQualityCheckFlag() || read.getMappingQuality() == 0 || read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || (!REALIGN_BADLY_MATED_READS && BadMateFilter.hasBadMate(read)); + +// if ( immobileReadForWriting && ! old) +// logger.warn("Newly skipping read: " + read + " isize = " + read.getInferredInsertSize()); + + return old || immobileReadForWriting; } private void cleanAndCallMap(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker, GenomeLoc readLoc) { @@ -481,11 +509,7 @@ public class IndelRealigner extends ReadWalker { knownIndelsToTry.clear(); indelRodsSeen.clear(); - emit(readsNotToClean); - emit(readsToClean.getReads()); - readsToClean.clear(); - readsNotToClean.clear(); - + emitReadLists(); try { do { currentInterval = intervals.hasNext() ? intervals.next() : null; @@ -514,9 +538,11 @@ public class IndelRealigner extends ReadWalker { knownIndelsToTry.clear(); indelRodsSeen.clear(); - // merge the two sets for emission - readsNotToClean.addAll(readsToClean.getReads()); - emit(readsNotToClean); + emitReadLists(); + // why was this different than the other emits? +// // merge the two sets for emission +// readsNotToClean.addAll(readsToClean.getReads()); +// emit(readsNotToClean); } if ( OUT_INDELS != null ) { diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/tools/CompareBAMAlignments.java b/java/src/org/broadinstitute/sting/oneoffprojects/tools/CompareBAMAlignments.java new file mode 100755 index 000000000..bd1b88dff --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/tools/CompareBAMAlignments.java @@ -0,0 +1,96 @@ +package org.broadinstitute.sting.oneoffprojects.tools; + +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.CommandLineProgram; + +import java.io.*; +import java.util.ArrayList; +import java.util.Date; +import java.util.Iterator; +import java.util.List; + +/** + * Used to test how long it takes to read through a text files and gzipped files. + * If the passed-in filename ends with .gz, it will be read using GZIPInputStream. + * Otherwise, its read using FileReader. + */ +public class CompareBAMAlignments extends CommandLineProgram { + + @Argument(fullName = "input", shortName = "i", doc = "xxx", required = true) + public List filenames; + + @Argument(fullName = "maxIsize", shortName = "s", doc = "xxx", required=false) + public int maxISize = -1; + + @Argument(fullName = "incr", shortName = "incr", doc = "xxx", required=false) + int incr = -1; + + @Override + protected int execute() { + try { + List> readers = new ArrayList>(); + for ( String filename : filenames ) { + final File file = new File(filename); + SAMFileReader reader = new SAMFileReader(file); + reader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); + readers.add(reader.iterator()); + } + + System.out.println("Reading..."); + int next = incr; + int counter = 0; + + while( true ) { + List reads = new ArrayList(); + for ( Iterator reader : readers ) { + if ( ! reader.hasNext() ) System.exit(0); + reads.add(reader.next()); + } + + // comparing + SAMRecord read1 = reads.get(0); + if ( read1.getInferredInsertSize() > maxISize ) { + for ( SAMRecord read : reads ) { + if(incr > 0 && counter % incr == 0) { + next += incr; + System.err.println(new Date() + " - counter " + counter); + System.err.println("read: " + read.format()); + } + + if ( ! read1.getReadName().equals(read.getReadName()) ) + bad(read1, read, "Names not equal"); + if ( read1.getAlignmentStart() != read.getAlignmentStart() ) + bad(read1, read, "Alignment starts not equal"); + if ( ! read1.getCigarString().equals(read.getCigarString()) ) + bad(read1, read, "Unequal CIGAR strings"); + } + } + counter++; + } + } catch(Exception e) { + System.err.println("ERROR: " + e); + e.printStackTrace(); + } + + return 0; + } + + private void bad(SAMRecord read1, SAMRecord read2, String msg) { + System.out.printf("%nBAD: %s%n", msg); + System.out.printf(" read1: %s %s %s%n", read1.getReadName(), read1.getAlignmentStart(), read1.getCigarString()); + System.out.printf(" read2: %s %s %s%n", read2.getReadName(), read2.getAlignmentStart(), read2.getCigarString()); + // System.exit(1); + } + + public static void main(String args[]) + { + try { + CommandLineProgram.start(new CompareBAMAlignments(), args); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + +} diff --git a/java/src/org/broadinstitute/sting/utils/sam/ConstrainedMateFixingSAMFileWriter.java b/java/src/org/broadinstitute/sting/utils/sam/ConstrainedMateFixingSAMFileWriter.java new file mode 100644 index 000000000..c05e691b3 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/sam/ConstrainedMateFixingSAMFileWriter.java @@ -0,0 +1,293 @@ +package org.broadinstitute.sting.utils.sam; + +import net.sf.picard.sam.SamPairUtil; +import net.sf.samtools.*; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.File; +import java.util.*; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * A locally resorting, mate fixing sam file writer that supports an idiom where reads are only moved around if + * the ISIZE of the pair is < X and reads are not allowed to move any more than Y bp from their original positions. + * + * To understand this data structure, let's begin by asking -- when are we certain we know the position of read R added + * to the writer and its mate M given that R has been added to the writer (but M may not be), their ISIZE in R, at the + * moment that a read K is added to the writer, under the constraints X and Y? Complex I know. First, because + * reads cannot move more than Y bp in either direction, we know that R originated at most R.pos + Y bp from its + * current position. Also, we know that K is at most K.pos + Y bp from it's original position. If R is maximally + * shifted to the right, and K shifted to the left, then they could at most move 2Y together. So if the distance + * between R and K > 2Y, we know that there are no reads left in the original stream that could be moved before R. + * + * Now, we also need to be certain if we have a mate pair M, that won't emit R before we can incorporate any move of + * M into the mate pair info R. There are two cases to consider here: + * + * If ISIZE > X, we know that we won't move M when we see it, so we can safely emit R knowing that + * M is fixed in place. + * + * If ISIZE <= X, M might be moved, and it we have to wait until we see M in the stream to know it's position. + * So R must be buffered until either M arrives, or we see a read K that's more than 2Y units past the original position + * of M. + * + * So the worst-case memory consumption here is proportional to the number of reads + * occurring between R and M + 2 Y, and so is proportional to the depth of the data and X and Y. + * + * This leads to the following simple algorithm: + * + * addAlignment(newRead): + * addReadToListOfReads(newRead) + * update mate pair of newRead if present in list of reads + * + * for ( read in list of reads [in order of increasing read.pos] ): + * if read.pos < newRead.pos - 2Y && (read.isize >= X || read.matePos < newRead.pos - 2 * Y): + * emit read and remove from list of reads + * else: + * break + * + * @author depristo + * @version 0.1 + */ +public class ConstrainedMateFixingSAMFileWriter implements SAMFileWriter { + final protected static Logger logger = Logger.getLogger(ConstrainedMateFixingSAMFileWriter.class); + private final static boolean DEBUG = false; + private final static boolean PRINT_COUNTER = true; + + /** How often do we check whether we want to emit reads? */ + private final static int EMIT_FREQUENCY = 1000; + + /** + * How much could a single read move in position from its original position? + * todo -- this really should be a provided parameter + */ + private final static int MAX_POS_MOVE_ALLOWED = 200; + + /** how we order our SAM records */ + private final SAMRecordComparator comparer = new SAMRecordCoordinateComparator(); + + // todo -- remove test comparer +// private static class MySAMRecordCoordinateComparator extends SAMRecordCoordinateComparator { +// @Override +// public int compare(final SAMRecord samRecord1, final SAMRecord samRecord2) { +// int cmp = super.fileOrderCompare(samRecord1, samRecord2); +// int cmpPos = new Integer(samRecord1.getAlignmentStart()).compareTo(samRecord2.getAlignmentStart()); +// if ( Math.signum(cmp) != Math.signum(cmpPos) ) +// logger.info(String.format("Comparing %d to %d => %d cmp and %d cmpPos", +// samRecord1.getAlignmentStart(), samRecord2.getAlignmentStart(), cmp, cmpPos)); +// return cmp; +// } +// } + + /** The place where we ultimately write out our records */ + final SAMFileWriter finalDestination; + + /** + * what is the maximum isize of a pair of reads that can move? Reads with isize > this value + * are assumes to not be allowed to move in the incoming read stream. + */ + final int maxInsertSizeForMovingReadPairs; + + int counter = 0; + int maxReadsInQueue = 0; + + /** read.name -> records */ + HashMap forMateMatching = new HashMap(); + //Queue waitingReads = new LinkedList(); + Queue waitingReads = new PriorityQueue(1000, comparer); + + + /** + * + * @param header + * @param outputFile + * @param compressionLevel + * @param maxInsertSizeForMovingReadPairs + */ + public ConstrainedMateFixingSAMFileWriter(final SAMFileHeader header, + final File outputFile, + final int compressionLevel, + final int maxInsertSizeForMovingReadPairs) { + this(new SAMFileWriterFactory().makeBAMWriter(header, true, outputFile, compressionLevel), + maxInsertSizeForMovingReadPairs); + } + + public ConstrainedMateFixingSAMFileWriter(final SAMFileWriter finalDestination, + final int maxInsertSizeForMovingReadPairs) { + this.finalDestination = finalDestination; + this.maxInsertSizeForMovingReadPairs = maxInsertSizeForMovingReadPairs; + } + + public int getMaxReadsInQueue() { return maxReadsInQueue; } + public int getNReadsInQueue() { return waitingReads.size(); } + + /** + * Retrieves the header to use when creating the new SAM file. + * @return header to use when creating the new SAM file. + */ + @Override + public SAMFileHeader getFileHeader() { + return finalDestination.getFileHeader(); + } + + private boolean noReadCanMoveBefore(int pos, SAMRecord addedRead) { + return pos + 2 * MAX_POS_MOVE_ALLOWED < addedRead.getAlignmentStart(); + } + +// private void verifyOrdering() { +// SAMRecord lastRead = null; +// List reads = new ArrayList(); +// +// reads.addAll(waitingReads); +// Collections.sort(reads, comparer); +// for ( SAMRecord read : reads ) { +// logger.info("READ is " + read.getReadName() + " pos " + read.getAlignmentStart()); +// if ( lastRead != null && comparer.fileOrderCompare(lastRead, read) > 0 ) +// throw new ReviewedStingException("BUG: records added out of order: read1=" + lastRead + +// ", pos=" + lastRead.getAlignmentStart() + " read2="+read + ", pos=" + read.getAlignmentStart()); +// lastRead = read; +// } +// +//// List reads = new ArrayList(); +//// while ( waitingReads.peek() != null ) { +//// SAMRecord read = waitingReads.poll(); +//// logger.info("READ is " + read.getReadName() + " pos " + read.getAlignmentStart()); +//// if ( lastRead != null && comparer.fileOrderCompare(lastRead, read) > 0 ) +//// throw new ReviewedStingException("BUG: records added out of order: read1=" + lastRead + +//// ", pos=" + lastRead.getAlignmentStart() + " read2="+read + ", pos=" + read.getAlignmentStart()); +//// lastRead = read; +//// reads.add(read); +//// } +// +// for ( SAMRecord read : reads ) waitingReads.add(read); +// } + + /** + * @{inheritDoc} + */ + @Override + public void addAlignment( SAMRecord newRead ) { + if ( DEBUG ) logger.info("New read pos " + newRead.getAlignmentStart()); + +// if ( newRead.getReadName().equals("ERR019492.23181457") ) +// logger.warn("foo"); + + + // fix mates, as needed + // Since setMateInfo can move reads, we potentially need to remove the mate, and requeue + // it to ensure proper sorting + if ( newRead.getReadPairedFlag() ) { + SAMRecord mate = forMateMatching.get(newRead.getReadName()); + if ( mate != null ) { + boolean reQueueMate = mate.getReadUnmappedFlag() && ! newRead.getReadUnmappedFlag(); + if ( reQueueMate ) { + // the mate was unmapped, but newRead was mapped, so the mate may have been moved + // to be next-to newRead, so needs to be reinserted into the waitingReads queue + // note -- this must be called before the setMateInfo call below + if ( ! waitingReads.remove(mate) ) + throw new ReviewedStingException("BUG: remove of mate failed at " + mate); + } + + // we've already seen our mate -- set the mate info and remove it from the map + SamPairUtil.setMateInfo(mate, newRead, null); + if ( reQueueMate ) waitingReads.add(mate); + forMateMatching.remove(newRead.getReadName()); + } else { + forMateMatching.put(newRead.getReadName(), newRead); + } + } + + waitingReads.add(newRead); +// logger.warn("GATKSamRecord newRead.equals(newread) = " + newRead.equals(newRead)); +// if ( ! waitingReads.remove(newRead) ) +// throw new ReviewedStingException("BUG: remove of failed at " + newRead); +// waitingReads.add(newRead); + maxReadsInQueue = Math.max(maxReadsInQueue, waitingReads.size()); + + if ( PRINT_COUNTER && counter++ % 10000 == 0 ) + logger.warn("Reads in queue " + waitingReads.size() + " max " + maxReadsInQueue); + + if ( counter % EMIT_FREQUENCY == 0 ) { + //verifyOrdering(); + while ( ! waitingReads.isEmpty() ) { // there's something in the queue + SAMRecord read = waitingReads.peek(); + //logger.info("Examining read at " + read.getAlignmentStart()); + + if ( noReadCanMoveBefore(read.getAlignmentStart(), newRead) && + (iSizeTooBigToMove(read) // we won't try to move such a read + || ! read.getReadPairedFlag() // we're not a paired read + || read.getReadUnmappedFlag() && read.getMateUnmappedFlag() // both reads are unmapped + || noReadCanMoveBefore(read.getMateAlignmentStart(), newRead ) ) ) { // we're already past where the mate started +// if ( read.getReadName().equals("20FUKAAXX100202:2:64:2458:35096") ) +// logger.warn("foo"); + + // remove reads from the map that we have emitted -- useful for case where the mate never showed up + forMateMatching.remove(read.getReadName()); + + if ( DEBUG ) + logger.warn(String.format("EMIT! At %d: read %s at %d with isize %d, mate start %d", + newRead.getAlignmentStart(), read.getReadName(), read.getAlignmentStart(), read.getInferredInsertSize(), read.getMateAlignmentStart())); + // emit to disk + finalDestination.addAlignment(waitingReads.remove()); + } else { + if ( DEBUG ) + logger.warn(String.format("At %d: read %s at %d with isize %d couldn't be emited, mate start %d", + newRead.getAlignmentStart(), read.getReadName(), read.getAlignmentStart(), read.getInferredInsertSize(), read.getMateAlignmentStart())); + break; + } + } + + if ( DEBUG ) logger.warn(String.format("At %d: Done with emit cycle", newRead.getAlignmentStart())); + } + } + + /** + * Returns true if the read shouldn't be moved given the constraints of this SAMFileWriter + * @param read + * @return + */ + public boolean iSizeTooBigToMove(SAMRecord read) { + return iSizeTooBigToMove(read, maxInsertSizeForMovingReadPairs); // we won't try to move such a read + } + + public static boolean iSizeTooBigToMove(SAMRecord read, int maxInsertSizeForMovingReadPairs) { + return ( read.getReadPairedFlag() && ! read.getMateUnmappedFlag() && read.getReferenceName() != read.getMateReferenceName() ) // maps to different chromosomes + || Math.abs(read.getInferredInsertSize()) > maxInsertSizeForMovingReadPairs; // we won't try to move such a read + } + + /** + * @{inheritDoc} + */ + @Override + public void close() { + // write out all of the remaining reads + while ( ! waitingReads.isEmpty() ) { // there's something in the queue + finalDestination.addAlignment(waitingReads.remove()); + } + finalDestination.close(); + } +} diff --git a/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 2ce8653b1..09e20e82c 100755 --- a/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -132,8 +132,8 @@ public class GATKSAMRecord extends SAMRecord { return mUnmappedFlag; } - public void setReadUmappedFlag(boolean b) { - mRecord.setReadUmappedFlag(b); + public void setReadUnmappedFlag(boolean b) { + mRecord.setReadUnmappedFlag(b); mUnmappedFlag = b; } @@ -432,7 +432,16 @@ public class GATKSAMRecord extends SAMRecord { public List validateCigar(long l) { return mRecord.validateCigar(l); } - public boolean equals(Object o) { return mRecord.equals(o); } + @Override + public boolean equals(Object o) { + if (this == o) return true; + + // note -- this forbids a GATKSAMRecord being equal to its underlying SAMRecord + if (!(o instanceof GATKSAMRecord)) return false; + + // note that we do not consider the GATKSAMRecord internal state at all + return mRecord.equals(((GATKSAMRecord)o).mRecord); + } public int hashCode() { return mRecord.hashCode(); } diff --git a/java/test/org/broadinstitute/sting/utils/sam/ConstrainedMateFixingSAMFileWriterUnitTest.java b/java/test/org/broadinstitute/sting/utils/sam/ConstrainedMateFixingSAMFileWriterUnitTest.java new file mode 100644 index 000000000..27a4be1c8 --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/sam/ConstrainedMateFixingSAMFileWriterUnitTest.java @@ -0,0 +1,141 @@ +package org.broadinstitute.sting.utils.sam; + +import com.sun.xml.internal.messaging.saaj.packaging.mime.util.OutputUtil; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.picard.sam.SamFileValidator; +import net.sf.samtools.*; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @author aaron + *

+ * Class ArtificialSAMFileWriter + *

+ * Test out the ArtificialSAMFileWriter class + */ +public class ConstrainedMateFixingSAMFileWriterUnitTest extends BaseTest { + final int MAX_TEMP_FILES = 10; + IndexedFastaSequenceFile fasta = null; + SAMFileReader bamIn; + + File referenceFile = new File("/Users/depristo/Desktop/broadLocal/localData/Homo_sapiens_assembly18.fasta"); // todo -- replace me with network version + final File BAM_FILE = new File("/Users/depristo/Desktop/broadLocal/GATK/trunk/HiSeq.test.bam"); + final File OUTPUT_FILE = new File("/Users/depristo/Desktop/broadLocal/GATK/trunk/HiSeq.1mb.CMF.bam"); + +// File referenceFile = new File(hg18Reference); +// final File BAM_FILE = new File(validationDataLocation + "HiSeq.1mb.bam"); +// final File OUTPUT_FILE = new File("HiSeq.1mb.CMF.bam"); + + final int MAX_ISIZE_FOR_MOVE = 1000; + + @BeforeMethod + public void beforeMethod(Object[] data) { + if (OUTPUT_FILE.exists()) OUTPUT_FILE.delete(); + bamIn = new SAMFileReader(BAM_FILE); + + try { + fasta = new IndexedFastaSequenceFile(referenceFile); + } + catch(FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(referenceFile,ex); + } + } + + private ConstrainedMateFixingSAMFileWriter makeWriter(final int maxInsertSizeForMovingReadPairs) { + return new ConstrainedMateFixingSAMFileWriter(bamIn.getFileHeader(), OUTPUT_FILE, 5, maxInsertSizeForMovingReadPairs); + } + + private List readBAM(File file) { + List l = new ArrayList(); + for ( SAMRecord read : new SAMFileReader(file) ) { l.add(read); } + return l; + } + + private void assertBamsAreEqual(File bam1File, File bam2File) { + List reads1 = readBAM(bam1File); + List reads2 = readBAM(bam2File); + Assert.assertEquals(reads1.size(), reads2.size()); + + for ( SAMRecord read1 : reads1 ) + Assert.assertTrue(reads2.contains(read1), "Couldn't find equal read in new BAM " + read1); + } + + private void writeReads(Collection reads) { + ConstrainedMateFixingSAMFileWriter writer = makeWriter(MAX_ISIZE_FOR_MOVE); + for ( SAMRecord read : reads ) { + writer.addAlignment(read); + } + writer.close(); + logger.warn("Max reads in memory: " + writer.getMaxReadsInQueue()); + } + + private void validateOutput(final File bamFile) { + SamFileValidator validator = new SamFileValidator(new PrintWriter(System.err), MAX_TEMP_FILES); + validator.setErrorsToIgnore(Arrays.asList(SAMValidationError.Type.INVALID_TAG_NM, SAMValidationError.Type.MATE_NOT_FOUND)); + boolean validated = validator.validateSamFileVerbose(new SAMFileReader(bamFile), fasta); + Assert.assertTrue(validated, "SAM file failed to validate"); + } + + @Test(enabled = true) + public void unmodifiedWrite() { + writeReads(readBAM(BAM_FILE)); + validateOutput(OUTPUT_FILE); + assertBamsAreEqual(BAM_FILE, OUTPUT_FILE); + } + + @Test(enabled = true) + public void writeResortingOnTheFlyNoPairs() { + List reads = readBAM(BAM_FILE); + for ( SAMRecord read : reads ) { + if ( ! ConstrainedMateFixingSAMFileWriter.iSizeTooBigToMove(read, MAX_ISIZE_FOR_MOVE) ) + read.setAlignmentStart(read.getAlignmentStart() - 10); + } + //writeReads(Utils.sortedSAMRecords(reads)); + writeReads(reads); + validateOutput(OUTPUT_FILE); + } +}