diff --git a/java/src/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java b/java/src/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java index bbe0a035f..8701ecf3c 100644 --- a/java/src/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java +++ b/java/src/org/broadinstitute/sting/gatk/io/StingSAMFileWriter.java @@ -28,8 +28,4 @@ public interface StingSAMFileWriter extends SAMFileWriter { * @param maxRecordsInRam Max number of records in RAM. */ public void setMaxRecordsInRam(int maxRecordsInRam); - - public void setMaxInsertSizeForMovingReadPairs(int maxInsertSizeForMovingReadPairs); - public void setMaxPositionalMoveAllowed(int maxPositionalMoveAllowed); - public void setUseConstrainedFileWriter(boolean useConstrainedFileWriter); } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java b/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java index d2d9d94aa..03a95c30b 100644 --- a/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java +++ b/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java @@ -36,7 +36,6 @@ import net.sf.samtools.util.RuntimeIOException; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterStub; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.ConstrainedMateFixingSAMFileWriter; /** * Provides temporary storage for SAMFileWriters. @@ -82,10 +81,6 @@ public class SAMFileWriterStorage implements SAMFileWriter, Storage, StingSAMFileWrite */ private boolean writeStarted = false; - /** - * Arguments used for the constrained mate pair fixing file writer - */ - private boolean useConstrainedFileWriter = false; - private int maxInsertSizeForMovingReadPairs = -1; - private int maxPositionalMoveAllowed = -1; - /** * HMM for BAQ, if needed @@ -265,28 +258,4 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite public void close() { outputTracker.getStorage(this).close(); } - - public int getMaxInsertSizeForMovingReadPairs() { - return maxInsertSizeForMovingReadPairs; - } - - public void setMaxInsertSizeForMovingReadPairs(int maxInsertSizeForMovingReadPairs) { - this.maxInsertSizeForMovingReadPairs = maxInsertSizeForMovingReadPairs; - } - - public int getMaxPositionalMoveAllowed() { - return maxPositionalMoveAllowed; - } - - public void setMaxPositionalMoveAllowed(int maxPositionalMoveAllowed) { - this.maxPositionalMoveAllowed = maxPositionalMoveAllowed; - } - - public boolean useConstrainedFileWriter() { - return useConstrainedFileWriter; - } - - public void setUseConstrainedFileWriter(boolean useConstrainedFileWriter) { - this.useConstrainedFileWriter = useConstrainedFileWriter; - } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 488c24fc0..3f2c43c96 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -29,7 +29,7 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; -public class UnifiedArgumentCollection { +public class UnifiedArgumentCollection { // control the various models to be used @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while DINDEL is also available for calling indels.", required = false) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java new file mode 100755 index 000000000..9fae0c132 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java @@ -0,0 +1,281 @@ +package org.broadinstitute.sting.gatk.walkers.indels; + +import net.sf.picard.sam.SamPairUtil; +import net.sf.samtools.*; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.util.HashMap; +import java.util.PriorityQueue; +import java.util.Queue; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * A locally resorting, mate fixing sam file writer that supports an idiom where reads are only moved around if + * the ISIZE of the pair is < X and reads are not allowed to move any more than Y bp from their original positions. + * + * To understand this data structure, let's begin by asking -- when are we certain we know the position of read R added + * to the writer and its mate M given that R has been added to the writer (but M may not be), their ISIZE in R, at the + * moment that a read K is added to the writer, under the constraints X and Y? Complex I know. First, because + * reads cannot move more than Y bp in either direction, we know that R originated at most R.pos + Y bp from its + * current position. Also, we know that K is at most K.pos + Y bp from it's original position. If R is maximally + * shifted to the right, and K shifted to the left, then they could at most move 2Y together. So if the distance + * between R and K > 2Y, we know that there are no reads left in the original stream that could be moved before R. + * + * Now, we also need to be certain if we have a mate pair M, that won't emit R before we can incorporate any move of + * M into the mate pair info R. There are two cases to consider here: + * + * If ISIZE > X, we know that we won't move M when we see it, so we can safely emit R knowing that + * M is fixed in place. + * + * If ISIZE <= X, M might be moved, and it we have to wait until we see M in the stream to know it's position. + * So R must be buffered until either M arrives, or we see a read K that's more than 2Y units past the original position + * of M. + * + * So the worst-case memory consumption here is proportional to the number of reads + * occurring between R and M + 2 Y, and so is proportional to the depth of the data and X and Y. + * + * This leads to the following simple algorithm: + * + * addAlignment(newRead): + * addReadToListOfReads(newRead) + * update mate pair of newRead if present in list of reads + * + * for ( read in list of reads [in order of increasing read.pos] ): + * if read.pos < newRead.pos - 2Y && (read.isize >= X || read.matePos < newRead.pos - 2 * Y): + * emit read and remove from list of reads + * else: + * break + * + * @author depristo, ebanks + * @version 0.2 + */ +public class ConstrainedMateFixingManager { + protected static final Logger logger = Logger.getLogger(ConstrainedMateFixingManager.class); + private static final boolean DEBUG = false; + + /** How often do we check whether we want to emit reads? */ + private final static int EMIT_FREQUENCY = 1000; + + /** + * How much could a single read move in position from its original position? + */ + final int MAX_POS_MOVE_ALLOWED; + + /** + * How many reads should we store in memory before flushing the queue? + */ + final int MAX_RECORDS_IN_MEMORY; + + /** how we order our SAM records */ + private final SAMRecordComparator comparer = new SAMRecordCoordinateComparator(); + + /** The place where we ultimately write out our records */ + final SAMFileWriter writer; + + /** + * what is the maximum isize of a pair of reads that can move? Reads with isize > this value + * are assumes to not be allowed to move in the incoming read stream. + */ + final int maxInsertSizeForMovingReadPairs; + + final GenomeLocParser genomeLocParser; + private GenomeLoc lastLocFlushed = null; + + int counter = 0; + + /** read.name -> records */ + HashMap forMateMatching = new HashMap(); + Queue waitingReads = new PriorityQueue(1000, comparer); + + //private SimpleTimer timer = new SimpleTimer("ConstrainedWriter"); + //private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds + //private long lastProgressPrintTime = -1; // When was the last time we printed progress log? + + + /** + * + * @param writer actual writer + * @param genomeLocParser the GenomeLocParser object + * @param maxInsertSizeForMovingReadPairs max insert size allowed for moving pairs + * @param maxMoveAllowed max positional move allowed for any read + * @param maxRecordsInMemory max records to keep in memory + */ + public ConstrainedMateFixingManager(final SAMFileWriter writer, + final GenomeLocParser genomeLocParser, + final int maxInsertSizeForMovingReadPairs, + final int maxMoveAllowed, + final int maxRecordsInMemory) { + this.writer = writer; + this.genomeLocParser = genomeLocParser; + this.maxInsertSizeForMovingReadPairs = maxInsertSizeForMovingReadPairs; + this.MAX_POS_MOVE_ALLOWED = maxMoveAllowed; + this.MAX_RECORDS_IN_MEMORY = maxRecordsInMemory; + + //timer.start(); + //lastProgressPrintTime = timer.currentTime(); + } + + public int getNReadsInQueue() { return waitingReads.size(); } + + public boolean canMoveReads(GenomeLoc earliestPosition) { + if ( DEBUG ) logger.info("Refusing to realign? " + earliestPosition + " vs. " + lastLocFlushed); + + return lastLocFlushed == null || + lastLocFlushed.compareContigs(earliestPosition) != 0 || + lastLocFlushed.distance(earliestPosition) > maxInsertSizeForMovingReadPairs; + } + + private boolean noReadCanMoveBefore(int pos, SAMRecord addedRead) { + return pos + 2 * MAX_POS_MOVE_ALLOWED < addedRead.getAlignmentStart(); + } + + private void writeRead(SAMRecord read) { + try { + writer.addAlignment(read); + } catch (IllegalArgumentException e) { + throw new UserException("If the maximum allowable reads in memory is too small, it may cause reads to be written out of order when trying to write the BAM; please see the --maxReadsInMemory argument for details. " + e.getMessage(), e); + } + } + + public void addRead( SAMRecord newRead ) { + if ( DEBUG ) logger.info("New read pos " + newRead.getAlignmentStart() + " OP = " + newRead.getAttribute("OP")); + + //final long curTime = timer.currentTime(); + //if ( curTime - lastProgressPrintTime > PROGRESS_PRINT_FREQUENCY ) { + // lastProgressPrintTime = curTime; + // System.out.println("WaitingReads.size = " + waitingReads.size() + ", forMateMatching.size = " + forMateMatching.size()); + //} + + // if the new read is on a different contig or we have too many reads, then we need to flush the queue and clear the map + boolean tooManyReads = getNReadsInQueue() >= MAX_RECORDS_IN_MEMORY; + if ( tooManyReads || (getNReadsInQueue() > 0 && waitingReads.peek().getReferenceIndex() != newRead.getReferenceIndex()) ) { + if ( DEBUG ) logger.warn("Flushing queue on " + (tooManyReads ? "too many reads" : ("move to new contig: " + newRead.getReferenceName())) + " at " + newRead.getAlignmentStart()); + + while ( getNReadsInQueue() > 1 ) { + // emit to disk + writeRead(waitingReads.remove()); + } + + lastLocFlushed = genomeLocParser.createGenomeLoc(waitingReads.peek()); + writeRead(waitingReads.remove()); + + if ( !tooManyReads ) + forMateMatching.clear(); + } + + // fix mates, as needed + // Since setMateInfo can move reads, we potentially need to remove the mate, and requeue + // it to ensure proper sorting + if ( newRead.getReadPairedFlag() ) { + SAMRecord mate = forMateMatching.get(newRead.getReadName()); + if ( mate != null ) { + // Frustratingly, Picard's setMateInfo() method unaligns (by setting the reference contig + // to '*') read pairs when both of their flags have the unmapped bit set. This is problematic + // when trying to emit reads in coordinate order because all of a sudden we have reads in the + // middle of the bam file that now belong at the end - and any mapped reads that get emitted + // after them trigger an exception in the writer. For our purposes, because we shouldn't be + // moving read pairs when they are both unmapped anyways, we'll just not run fix mates on them. + boolean doNotFixMates = newRead.getReadUnmappedFlag() && mate.getReadUnmappedFlag(); + if ( !doNotFixMates ) { + + boolean reQueueMate = mate.getReadUnmappedFlag() && ! newRead.getReadUnmappedFlag(); + if ( reQueueMate ) { + // the mate was unmapped, but newRead was mapped, so the mate may have been moved + // to be next-to newRead, so needs to be reinserted into the waitingReads queue + // note -- this must be called before the setMateInfo call below + if ( ! waitingReads.remove(mate) ) + // we must have hit a region with too much depth and flushed the queue + reQueueMate = false; + } + + // we've already seen our mate -- set the mate info and remove it from the map + SamPairUtil.setMateInfo(mate, newRead, null); + if ( reQueueMate ) waitingReads.add(mate); + } + + forMateMatching.remove(newRead.getReadName()); + } else { + forMateMatching.put(newRead.getReadName(), newRead); + } + } + + waitingReads.add(newRead); + + if ( ++counter % EMIT_FREQUENCY == 0 ) { + while ( ! waitingReads.isEmpty() ) { // there's something in the queue + SAMRecord read = waitingReads.peek(); + + if ( noReadCanMoveBefore(read.getAlignmentStart(), newRead) && + (iSizeTooBigToMove(read) // we won't try to move such a read + || ! read.getReadPairedFlag() // we're not a paired read + || read.getReadUnmappedFlag() && read.getMateUnmappedFlag() // both reads are unmapped + || noReadCanMoveBefore(read.getMateAlignmentStart(), newRead ) ) ) { // we're already past where the mate started + + // remove reads from the map that we have emitted -- useful for case where the mate never showed up + forMateMatching.remove(read.getReadName()); + + if ( DEBUG ) + logger.warn(String.format("EMIT! At %d: read %s at %d with isize %d, mate start %d, op = %s", + newRead.getAlignmentStart(), read.getReadName(), read.getAlignmentStart(), + read.getInferredInsertSize(), read.getMateAlignmentStart(), read.getAttribute("OP"))); + // emit to disk + writeRead(waitingReads.remove()); + } else { + if ( DEBUG ) + logger.warn(String.format("At %d: read %s at %d with isize %d couldn't be emited, mate start %d", + newRead.getAlignmentStart(), read.getReadName(), read.getAlignmentStart(), read.getInferredInsertSize(), read.getMateAlignmentStart())); + break; + } + } + + if ( DEBUG ) logger.warn(String.format("At %d: Done with emit cycle", newRead.getAlignmentStart())); + } + } + + /** + * @param read the read + * @return true if the read shouldn't be moved given the constraints of this SAMFileWriter + */ + public boolean iSizeTooBigToMove(SAMRecord read) { + return iSizeTooBigToMove(read, maxInsertSizeForMovingReadPairs); // we won't try to move such a read + } + + public static boolean iSizeTooBigToMove(SAMRecord read, int maxInsertSizeForMovingReadPairs) { + return ( read.getReadPairedFlag() && ! read.getMateUnmappedFlag() && read.getReferenceName() != read.getMateReferenceName() ) // maps to different chromosomes + || Math.abs(read.getInferredInsertSize()) > maxInsertSizeForMovingReadPairs; // we won't try to move such a read + } + + public void close() { + // write out all of the remaining reads + while ( ! waitingReads.isEmpty() ) { // there's something in the queue + writeRead(waitingReads.remove()); + } + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 7358c59e6..e273740fb 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -50,7 +50,6 @@ import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; import org.broadinstitute.sting.utils.interval.NwayIntervalMergingIterator; -import org.broadinstitute.sting.utils.sam.ConstrainedMateFixingSAMFileWriter; import org.broadinstitute.sting.utils.text.TextFormattingUtils; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.sam.AlignmentUtils; @@ -85,8 +84,9 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) protected double MISMATCH_THRESHOLD = 0.15; - @Output(required=false, doc="Output bam") + @Output(required=true, doc="Output bam") protected StingSAMFileWriter writer = null; + protected ConstrainedMateFixingManager manager = null; @Argument(fullName="useOnlyKnownIndels", shortName="knownsOnly", required=false, doc="Don't run 'Smith-Waterman' to generate alternate consenses; use only known indels provided as RODs for constructing the alternate references.") protected boolean USE_KNOWN_INDELS_ONLY = false; @@ -98,6 +98,10 @@ public class IndelRealigner extends ReadWalker { // ADVANCED OPTIONS FOLLOW + @Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+ + "Keep it low to minimize memory consumption (but the tool may skip realignment on regions with too much coverage. If it is too low, it may generate errors during realignment); keep it high to maximize realignment (but make sure to give Java enough memory).", required=false) + protected int MAX_RECORDS_IN_MEMORY = 200000; + @Argument(fullName="maxIsizeForMovement", shortName="maxIsize", doc="maximum insert size of read pairs that we attempt to realign", required=false) protected int MAX_ISIZE_FOR_MOVEMENT = 3000; @@ -149,10 +153,6 @@ public class IndelRealigner extends ReadWalker { // DEPRECATED - @Deprecated - @Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="This argument is no longer used.", required=false) - protected int DEPRECATED_MAX_IN_RAM = 0; - @Deprecated @Argument(fullName="sortInCoordinateOrderEvenThoughItIsHighlyUnsafe", doc="This argument is no longer used.", required=false) protected boolean DEPRECATED_SORT_IN_COORDINATE_ORDER = false; @@ -210,12 +210,13 @@ public class IndelRealigner extends ReadWalker { private FileWriter statsOutput = null; private FileWriter snpsOutput = null; - protected Map nwayWriters = null; + protected Map nwayWriters = null; // debug info for lazy SW evaluation: private long exactMatchesFound = 0; // how many reads exactly matched a consensus we already had private long SWalignmentRuns = 0; // how many times (=for how many reads) we ran SW alignment private long SWalignmentSuccess = 0; // how many SW alignments were "successful" (i.e. found a workable indel and resulted in non-null consensus) + public void initialize() { if ( LOD_THRESHOLD < 0.0 ) @@ -255,9 +256,9 @@ public class IndelRealigner extends ReadWalker { if ( N_WAY_OUT != null ) { - if ( writer != null ) throw new UserException.BadInput("-nWayOut and -o arguments are mutually exclusive"); + //if ( writer != null ) throw new UserException.BadInput("-nWayOut and -o arguments are mutually exclusive"); - nwayWriters = new HashMap(); + nwayWriters = new HashMap(); Map fname_map = null; @@ -316,15 +317,15 @@ public class IndelRealigner extends ReadWalker { File f = new File(outName); SAMFileHeader header = getToolkit().getSAMFileHeader(rid); header.setSortOrder(SAMFileHeader.SortOrder.coordinate); - SAMFileWriter sw = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, f); - nwayWriters.put(rid,sw); + SAMFileWriter sw = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, true, f); + nwayWriters.put(rid, new ConstrainedMateFixingManager(sw, getToolkit().getGenomeLocParser(), MAX_ISIZE_FOR_MOVEMENT, MAX_POS_MOVE_ALLOWED, MAX_RECORDS_IN_MEMORY)); } } - // set up the output writer(s) - if ( writer != null ) - setupWriter(getToolkit().getSAMFileHeader()); + // set up the output writer + setupWriter(getToolkit().getSAMFileHeader()); + manager = new ConstrainedMateFixingManager(writer, getToolkit().getGenomeLocParser(), MAX_ISIZE_FOR_MOVEMENT, MAX_POS_MOVE_ALLOWED, MAX_RECORDS_IN_MEMORY); if ( OUT_INDELS != null ) { try { @@ -378,41 +379,32 @@ public class IndelRealigner extends ReadWalker { writer.writeHeader(header); writer.setPresorted(true); - - writer.setUseConstrainedFileWriter(true); - writer.setMaxInsertSizeForMovingReadPairs(MAX_ISIZE_FOR_MOVEMENT); - writer.setMaxPositionalMoveAllowed(MAX_POS_MOVE_ALLOWED); } private void emit(final SAMRecord read) { try { - if ( writer != null ) - writer.addAlignment(read); - else if ( N_WAY_OUT != null ) { + if ( N_WAY_OUT != null ) { SAMReaderID rid = getToolkit().getReaderIDForRead(read); - SAMFileWriter w = nwayWriters.get(rid); + ConstrainedMateFixingManager m = nwayWriters.get(rid); // reset read's read group from merged to original if read group id collision has happened in merging: if ( getToolkit().getReadsDataSource().hasReadGroupCollisions() ) { read.setAttribute("RG", getToolkit().getReadsDataSource().getOriginalReadGroupId((String)read.getAttribute("RG"))); } - w.addAlignment(read); + m.addRead(read); + } else { + manager.addRead(read); } } catch (RuntimeIOException e) { throw new UserException.ErrorWritingBamFile(e.getMessage()); } } - private void emit(final Collection reads) { - for ( SAMRecord read : reads ) - emit(read); - } - private void emitReadLists() { // pre-merge lists with priority queue for constrained SAMFileWriter - //logger.warn("EMIT currentInterval " + currentInterval); readsNotToClean.addAll(readsToClean.getReads()); - emit(ReadUtils.coordinateSortReads(readsNotToClean)); + for ( SAMRecord read : ReadUtils.coordinateSortReads(readsNotToClean) ) + emit(read); readsToClean.clear(); readsNotToClean.clear(); } @@ -475,11 +467,15 @@ public class IndelRealigner extends ReadWalker { read.getReadFailsVendorQualityCheckFlag() || read.getMappingQuality() == 0 || read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || - ConstrainedMateFixingSAMFileWriter.iSizeTooBigToMove(read, MAX_ISIZE_FOR_MOVEMENT); + ConstrainedMateFixingManager.iSizeTooBigToMove(read, MAX_ISIZE_FOR_MOVEMENT); } private void cleanAndCallMap(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker, GenomeLoc readLoc) { - clean(readsToClean); + if ( readsToClean.size() > 0 ) { + GenomeLoc earliestPossibleMove = getToolkit().getGenomeLocParser().createGenomeLoc(readsToClean.getReads().get(0)); + if ( manager.canMoveReads(earliestPossibleMove) ) + clean(readsToClean); + } knownIndelsToTry.clear(); indelRodsSeen.clear(); @@ -507,14 +503,18 @@ public class IndelRealigner extends ReadWalker { } public void onTraversalDone(Integer result) { - if ( readsToClean.size() > 0 || readsNotToClean.size() > 0 ) { - clean(readsToClean); - knownIndelsToTry.clear(); - indelRodsSeen.clear(); - + if ( readsToClean.size() > 0 ) { + GenomeLoc earliestPossibleMove = getToolkit().getGenomeLocParser().createGenomeLoc(readsToClean.getReads().get(0)); + if ( manager.canMoveReads(earliestPossibleMove) ) + clean(readsToClean); emitReadLists(); + } else if ( readsNotToClean.size() > 0 ) { + emitReadLists(); } + knownIndelsToTry.clear(); + indelRodsSeen.clear(); + if ( OUT_INDELS != null ) { try { indelOutput.close(); @@ -539,12 +539,15 @@ public class IndelRealigner extends ReadWalker { if ( N_WAY_OUT != null ) { try { - for ( SAMFileWriter w : nwayWriters.values() ) - w.close(); + for ( ConstrainedMateFixingManager m : nwayWriters.values() ) + m.close(); } catch (RuntimeIOException e) { throw new UserException.ErrorWritingBamFile(e.getMessage()); } + } else { + manager.close(); } + if ( CHECKEARLY ) { logger.info("SW alignments runs: "+SWalignmentRuns); logger.info("SW alignments successfull: "+SWalignmentSuccess + " ("+SWalignmentSuccess/SWalignmentRuns+"% of SW runs)"); @@ -601,7 +604,7 @@ public class IndelRealigner extends ReadWalker { return; byte[] reference = readsToClean.getReference(referenceReader); - int leftmostIndex = (int)readsToClean.getLocation().getStart(); + int leftmostIndex = readsToClean.getLocation().getStart(); final ArrayList refReads = new ArrayList(); // reads that perfectly match ref final ArrayList altReads = new ArrayList(); // reads that don't perfectly match diff --git a/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 29d946410..0f16de587 100644 --- a/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -29,9 +29,7 @@ import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; -import net.sf.samtools.util.StringUtil; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.*; import org.broadinstitute.sting.utils.Utils; @@ -48,58 +46,10 @@ public class AlignmentUtils { public long mismatchQualities = 0; } - /** Returns number of mismatches in the alignment r to the reference sequence - * refSeq assuming the alignment starts at (ZERO-based) position refIndex on the - * specified reference sequence; in other words, refIndex is used in place of alignment's own - * getAlignmentStart() coordinate and the latter is never used. However, the structure of the alignment r - * (i.e. it's cigar string with all the insertions/deletions it may specify) is fully respected. - * - * THIS CODE ASSUMES THAT ALL BYTES COME FROM UPPERCASED CHARS. - * - * @param r alignment - * @param refSeq chunk of reference sequence that subsumes the alignment completely (if alignment runs out of - * the reference string, IndexOutOfBound exception will be thrown at runtime). - * @param refIndex zero-based position, at which the alignment starts on the specified reference string. - * @return the number of mismatches - */ - public static int numMismatches(SAMRecord r, byte[] refSeq, int refIndex) { - return getMismatchCount(r, refSeq, refIndex).numMismatches; - } - - /** Same as #numMismatches(SAMRecord, byte[], refIndex), but counts mismatches only along the partial stretch - * on the read of length nReadBases starting at (0-based) position readIndex. - * @param r Aligned read to count mismatches for - * @param refSeq Chunk of reference sequence that subsumes the alignment - * @param refIndex Zero-based position on refSeq where the alignment for the whole read starts - * @param readIndex Zero-based position on the read, the mismatches will be counted only from this position on - * @param nReadBases Length of continuous stretch on the read, along which mismatches will be counted - * @return - */ - public static int numMismatches(SAMRecord r, byte[] refSeq, int refIndex, int readIndex, int nReadBases) { - if ( r.getReadUnmappedFlag() ) return 1000000; - return getMismatchCount(r, refSeq, refIndex,readIndex,nReadBases).numMismatches; - } - - public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex, int readIndex, int nReadBases) { - return getMismatchCount(r, refSeq, refIndex,readIndex,nReadBases).mismatchQualities; - } - - @Deprecated - public static int numMismatches(SAMRecord r, String refSeq, int refIndex ) { - if ( r.getReadUnmappedFlag() ) return 1000000; - return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex); - } - public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex) { return getMismatchCount(r, refSeq, refIndex).mismatchQualities; } - @Deprecated - public static long mismatchingQualities(SAMRecord r, String refSeq, int refIndex ) { - if ( r.getReadUnmappedFlag() ) return 1000000; - return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex); - } - public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) { return getMismatchCount(r,refSeq,refIndex,0,r.getReadLength()); } @@ -191,8 +141,8 @@ public class AlignmentUtils { public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite, boolean qualitySumInsteadOfMismatchCount) { int sum = 0; - int windowStart = (int)ref.getWindow().getStart(); - int windowStop = (int)ref.getWindow().getStop(); + int windowStart = ref.getWindow().getStart(); + int windowStop = ref.getWindow().getStop(); byte[] refBases = ref.getBases(); byte[] readBases = p.getRead().getReadBases(); byte[] readQualities = p.getRead().getBaseQualities(); @@ -262,11 +212,11 @@ public class AlignmentUtils { // it's possible we aren't starting at the beginning of a read, // and we don't need to look at any of the previous context outside our window // (although we do need future context) - int readStartPos = Math.max(read.getAlignmentStart(), (int)ref.getLocus().getStart() - windowSize); + int readStartPos = Math.max(read.getAlignmentStart(), ref.getLocus().getStart() - windowSize); int currentReadPos = read.getAlignmentStart(); byte[] refBases = ref.getBases(); - int refIndex = readStartPos - (int)ref.getWindow().getStart(); + int refIndex = readStartPos - ref.getWindow().getStart(); if ( refIndex < 0 ) { throw new IllegalStateException("When calculating mismatches, we somehow don't have enough previous reference context for read " + read.getReadName() + " at position " + ref.getLocus()); } @@ -381,46 +331,6 @@ public class AlignmentUtils { return n; } - @Deprecated - public static char[] alignmentToCharArray( final Cigar cigar, final char[] read, final char[] ref ) { - - final char[] alignment = new char[read.length]; - int refPos = 0; - int alignPos = 0; - - for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) { - - final CigarElement ce = cigar.getCigarElement(iii); - final int elementLength = ce.getLength(); - - switch( ce.getOperator() ) { - case I: - case S: - for ( int jjj = 0 ; jjj < elementLength; jjj++ ) { - alignment[alignPos++] = '+'; - } - break; - case D: - case N: - refPos++; - break; - case M: - for ( int jjj = 0 ; jjj < elementLength; jjj++ ) { - alignment[alignPos] = ref[refPos]; - alignPos++; - refPos++; - } - break; - case H: - case P: - break; - default: - throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() ); - } - } - return alignment; - } - public static byte[] alignmentToByteArray( final Cigar cigar, final byte[] read, final byte[] ref ) { final byte[] alignment = new byte[read.length]; diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java index e70ca9d14..7a3aa085f 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; import java.util.Arrays; @@ -101,4 +102,28 @@ public class IndelRealignerIntegrationTest extends WalkerTest { Arrays.asList("be859f9a98d738becee0526887cae42e")); executeTest("realigner long run", spec); } + + @Test(dependsOnMethods = { "testStats" }) + public void testMaxReadsInMemory() { + HashMap e = new HashMap(); + e.put( "--maxReadsInMemory 10000", "424271ca000442e8b338c67d95dadb82" ); + e.put( "--maxReadsInMemory 40000", base_md5 ); + + for ( Map.Entry entry : e.entrySet() ) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + entry.getKey(), + 1, + Arrays.asList(entry.getValue())); + executeTest(String.format("realigner [%s]", entry.getKey()), spec); + } + } + + @Test(expectedExceptions = { RuntimeException.class }, dependsOnMethods = { "testMaxReadsInMemory" }) + public void testFailure() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + "--maxReadsInMemory 1000", + 1, + Arrays.asList("")); + executeTest("realigner failure", spec); + } } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java index df90554a9..fd5ad0b22 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java @@ -31,6 +31,7 @@ public class IndelRealignerPerformanceTest extends WalkerTest { " -maxConsensuses 100" + " -greedy 100" + " -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" + + " -o /dev/null" + " -I " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.bam" + " -L chr1:1-5,650,000" + " -targetIntervals " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.realigner.intervals", @@ -45,6 +46,7 @@ public class IndelRealignerPerformanceTest extends WalkerTest { " -maxConsensuses 100" + " -greedy 100" + " -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" + + " -o /dev/null" + " -I " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.bam" + " -L chr1:1-150,000,000" + " -targetIntervals " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.realigner.intervals",