New attempt at the constrained movement version of the indel realigner (I've kept around the old writer for now). The new contract is that the realigner must ask permission before trying to clean an area; permission will be denied by the CM-Manager if it was required to flush its cache of reads because of too much depth within a distance of maxInsertSizeForMovingReadPairs. Added integration tests to cover different max cache sizes, including an expected exception when too small a value is chosen. The actual logic changes were fairly minor - much of this commit is really just some cleanup. I'd like to throw 1000G Phase I at it, but will respectfully wait for Ryan to hit his deadline before doing so.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5414 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
204582bcd5
commit
3596c56602
|
|
@ -28,8 +28,4 @@ public interface StingSAMFileWriter extends SAMFileWriter {
|
|||
* @param maxRecordsInRam Max number of records in RAM.
|
||||
*/
|
||||
public void setMaxRecordsInRam(int maxRecordsInRam);
|
||||
|
||||
public void setMaxInsertSizeForMovingReadPairs(int maxInsertSizeForMovingReadPairs);
|
||||
public void setMaxPositionalMoveAllowed(int maxPositionalMoveAllowed);
|
||||
public void setUseConstrainedFileWriter(boolean useConstrainedFileWriter);
|
||||
}
|
||||
|
|
@ -36,7 +36,6 @@ import net.sf.samtools.util.RuntimeIOException;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterStub;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.ConstrainedMateFixingSAMFileWriter;
|
||||
|
||||
/**
|
||||
* Provides temporary storage for SAMFileWriters.
|
||||
|
|
@ -82,10 +81,6 @@ public class SAMFileWriterStorage implements SAMFileWriter, Storage<SAMFileWrite
|
|||
}
|
||||
else
|
||||
throw new UserException("Unable to write to SAM file; neither a target file nor a stream has been specified");
|
||||
|
||||
if ( stub.useConstrainedFileWriter() ) {
|
||||
this.writer = new ConstrainedMateFixingSAMFileWriter(writer, stub.getMaxInsertSizeForMovingReadPairs(), stub.getMaxPositionalMoveAllowed());
|
||||
}
|
||||
}
|
||||
|
||||
public SAMFileHeader getFileHeader() {
|
||||
|
|
|
|||
|
|
@ -101,13 +101,6 @@ public class SAMFileWriterStub implements Stub<SAMFileWriter>, StingSAMFileWrite
|
|||
*/
|
||||
private boolean writeStarted = false;
|
||||
|
||||
/**
|
||||
* Arguments used for the constrained mate pair fixing file writer
|
||||
*/
|
||||
private boolean useConstrainedFileWriter = false;
|
||||
private int maxInsertSizeForMovingReadPairs = -1;
|
||||
private int maxPositionalMoveAllowed = -1;
|
||||
|
||||
|
||||
/**
|
||||
* HMM for BAQ, if needed
|
||||
|
|
@ -265,28 +258,4 @@ public class SAMFileWriterStub implements Stub<SAMFileWriter>, StingSAMFileWrite
|
|||
public void close() {
|
||||
outputTracker.getStorage(this).close();
|
||||
}
|
||||
|
||||
public int getMaxInsertSizeForMovingReadPairs() {
|
||||
return maxInsertSizeForMovingReadPairs;
|
||||
}
|
||||
|
||||
public void setMaxInsertSizeForMovingReadPairs(int maxInsertSizeForMovingReadPairs) {
|
||||
this.maxInsertSizeForMovingReadPairs = maxInsertSizeForMovingReadPairs;
|
||||
}
|
||||
|
||||
public int getMaxPositionalMoveAllowed() {
|
||||
return maxPositionalMoveAllowed;
|
||||
}
|
||||
|
||||
public void setMaxPositionalMoveAllowed(int maxPositionalMoveAllowed) {
|
||||
this.maxPositionalMoveAllowed = maxPositionalMoveAllowed;
|
||||
}
|
||||
|
||||
public boolean useConstrainedFileWriter() {
|
||||
return useConstrainedFileWriter;
|
||||
}
|
||||
|
||||
public void setUseConstrainedFileWriter(boolean useConstrainedFileWriter) {
|
||||
this.useConstrainedFileWriter = useConstrainedFileWriter;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ import org.broadinstitute.sting.commandline.Argument;
|
|||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
|
||||
|
||||
public class UnifiedArgumentCollection {
|
||||
public class UnifiedArgumentCollection {
|
||||
|
||||
// control the various models to be used
|
||||
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while DINDEL is also available for calling indels.", required = false)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,281 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.indels;
|
||||
|
||||
import net.sf.picard.sam.SamPairUtil;
|
||||
import net.sf.samtools.*;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Queue;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* A locally resorting, mate fixing sam file writer that supports an idiom where reads are only moved around if
|
||||
* the ISIZE of the pair is < X and reads are not allowed to move any more than Y bp from their original positions.
|
||||
*
|
||||
* To understand this data structure, let's begin by asking -- when are we certain we know the position of read R added
|
||||
* to the writer and its mate M given that R has been added to the writer (but M may not be), their ISIZE in R, at the
|
||||
* moment that a read K is added to the writer, under the constraints X and Y? Complex I know. First, because
|
||||
* reads cannot move more than Y bp in either direction, we know that R originated at most R.pos + Y bp from its
|
||||
* current position. Also, we know that K is at most K.pos + Y bp from it's original position. If R is maximally
|
||||
* shifted to the right, and K shifted to the left, then they could at most move 2Y together. So if the distance
|
||||
* between R and K > 2Y, we know that there are no reads left in the original stream that could be moved before R.
|
||||
*
|
||||
* Now, we also need to be certain if we have a mate pair M, that won't emit R before we can incorporate any move of
|
||||
* M into the mate pair info R. There are two cases to consider here:
|
||||
*
|
||||
* If ISIZE > X, we know that we won't move M when we see it, so we can safely emit R knowing that
|
||||
* M is fixed in place.
|
||||
*
|
||||
* If ISIZE <= X, M might be moved, and it we have to wait until we see M in the stream to know it's position.
|
||||
* So R must be buffered until either M arrives, or we see a read K that's more than 2Y units past the original position
|
||||
* of M.
|
||||
*
|
||||
* So the worst-case memory consumption here is proportional to the number of reads
|
||||
* occurring between R and M + 2 Y, and so is proportional to the depth of the data and X and Y.
|
||||
*
|
||||
* This leads to the following simple algorithm:
|
||||
*
|
||||
* addAlignment(newRead):
|
||||
* addReadToListOfReads(newRead)
|
||||
* update mate pair of newRead if present in list of reads
|
||||
*
|
||||
* for ( read in list of reads [in order of increasing read.pos] ):
|
||||
* if read.pos < newRead.pos - 2Y && (read.isize >= X || read.matePos < newRead.pos - 2 * Y):
|
||||
* emit read and remove from list of reads
|
||||
* else:
|
||||
* break
|
||||
*
|
||||
* @author depristo, ebanks
|
||||
* @version 0.2
|
||||
*/
|
||||
public class ConstrainedMateFixingManager {
|
||||
protected static final Logger logger = Logger.getLogger(ConstrainedMateFixingManager.class);
|
||||
private static final boolean DEBUG = false;
|
||||
|
||||
/** How often do we check whether we want to emit reads? */
|
||||
private final static int EMIT_FREQUENCY = 1000;
|
||||
|
||||
/**
|
||||
* How much could a single read move in position from its original position?
|
||||
*/
|
||||
final int MAX_POS_MOVE_ALLOWED;
|
||||
|
||||
/**
|
||||
* How many reads should we store in memory before flushing the queue?
|
||||
*/
|
||||
final int MAX_RECORDS_IN_MEMORY;
|
||||
|
||||
/** how we order our SAM records */
|
||||
private final SAMRecordComparator comparer = new SAMRecordCoordinateComparator();
|
||||
|
||||
/** The place where we ultimately write out our records */
|
||||
final SAMFileWriter writer;
|
||||
|
||||
/**
|
||||
* what is the maximum isize of a pair of reads that can move? Reads with isize > this value
|
||||
* are assumes to not be allowed to move in the incoming read stream.
|
||||
*/
|
||||
final int maxInsertSizeForMovingReadPairs;
|
||||
|
||||
final GenomeLocParser genomeLocParser;
|
||||
private GenomeLoc lastLocFlushed = null;
|
||||
|
||||
int counter = 0;
|
||||
|
||||
/** read.name -> records */
|
||||
HashMap<String, SAMRecord> forMateMatching = new HashMap<String, SAMRecord>();
|
||||
Queue<SAMRecord> waitingReads = new PriorityQueue<SAMRecord>(1000, comparer);
|
||||
|
||||
//private SimpleTimer timer = new SimpleTimer("ConstrainedWriter");
|
||||
//private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds
|
||||
//private long lastProgressPrintTime = -1; // When was the last time we printed progress log?
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @param writer actual writer
|
||||
* @param genomeLocParser the GenomeLocParser object
|
||||
* @param maxInsertSizeForMovingReadPairs max insert size allowed for moving pairs
|
||||
* @param maxMoveAllowed max positional move allowed for any read
|
||||
* @param maxRecordsInMemory max records to keep in memory
|
||||
*/
|
||||
public ConstrainedMateFixingManager(final SAMFileWriter writer,
|
||||
final GenomeLocParser genomeLocParser,
|
||||
final int maxInsertSizeForMovingReadPairs,
|
||||
final int maxMoveAllowed,
|
||||
final int maxRecordsInMemory) {
|
||||
this.writer = writer;
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
this.maxInsertSizeForMovingReadPairs = maxInsertSizeForMovingReadPairs;
|
||||
this.MAX_POS_MOVE_ALLOWED = maxMoveAllowed;
|
||||
this.MAX_RECORDS_IN_MEMORY = maxRecordsInMemory;
|
||||
|
||||
//timer.start();
|
||||
//lastProgressPrintTime = timer.currentTime();
|
||||
}
|
||||
|
||||
public int getNReadsInQueue() { return waitingReads.size(); }
|
||||
|
||||
public boolean canMoveReads(GenomeLoc earliestPosition) {
|
||||
if ( DEBUG ) logger.info("Refusing to realign? " + earliestPosition + " vs. " + lastLocFlushed);
|
||||
|
||||
return lastLocFlushed == null ||
|
||||
lastLocFlushed.compareContigs(earliestPosition) != 0 ||
|
||||
lastLocFlushed.distance(earliestPosition) > maxInsertSizeForMovingReadPairs;
|
||||
}
|
||||
|
||||
private boolean noReadCanMoveBefore(int pos, SAMRecord addedRead) {
|
||||
return pos + 2 * MAX_POS_MOVE_ALLOWED < addedRead.getAlignmentStart();
|
||||
}
|
||||
|
||||
private void writeRead(SAMRecord read) {
|
||||
try {
|
||||
writer.addAlignment(read);
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new UserException("If the maximum allowable reads in memory is too small, it may cause reads to be written out of order when trying to write the BAM; please see the --maxReadsInMemory argument for details. " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
public void addRead( SAMRecord newRead ) {
|
||||
if ( DEBUG ) logger.info("New read pos " + newRead.getAlignmentStart() + " OP = " + newRead.getAttribute("OP"));
|
||||
|
||||
//final long curTime = timer.currentTime();
|
||||
//if ( curTime - lastProgressPrintTime > PROGRESS_PRINT_FREQUENCY ) {
|
||||
// lastProgressPrintTime = curTime;
|
||||
// System.out.println("WaitingReads.size = " + waitingReads.size() + ", forMateMatching.size = " + forMateMatching.size());
|
||||
//}
|
||||
|
||||
// if the new read is on a different contig or we have too many reads, then we need to flush the queue and clear the map
|
||||
boolean tooManyReads = getNReadsInQueue() >= MAX_RECORDS_IN_MEMORY;
|
||||
if ( tooManyReads || (getNReadsInQueue() > 0 && waitingReads.peek().getReferenceIndex() != newRead.getReferenceIndex()) ) {
|
||||
if ( DEBUG ) logger.warn("Flushing queue on " + (tooManyReads ? "too many reads" : ("move to new contig: " + newRead.getReferenceName())) + " at " + newRead.getAlignmentStart());
|
||||
|
||||
while ( getNReadsInQueue() > 1 ) {
|
||||
// emit to disk
|
||||
writeRead(waitingReads.remove());
|
||||
}
|
||||
|
||||
lastLocFlushed = genomeLocParser.createGenomeLoc(waitingReads.peek());
|
||||
writeRead(waitingReads.remove());
|
||||
|
||||
if ( !tooManyReads )
|
||||
forMateMatching.clear();
|
||||
}
|
||||
|
||||
// fix mates, as needed
|
||||
// Since setMateInfo can move reads, we potentially need to remove the mate, and requeue
|
||||
// it to ensure proper sorting
|
||||
if ( newRead.getReadPairedFlag() ) {
|
||||
SAMRecord mate = forMateMatching.get(newRead.getReadName());
|
||||
if ( mate != null ) {
|
||||
// Frustratingly, Picard's setMateInfo() method unaligns (by setting the reference contig
|
||||
// to '*') read pairs when both of their flags have the unmapped bit set. This is problematic
|
||||
// when trying to emit reads in coordinate order because all of a sudden we have reads in the
|
||||
// middle of the bam file that now belong at the end - and any mapped reads that get emitted
|
||||
// after them trigger an exception in the writer. For our purposes, because we shouldn't be
|
||||
// moving read pairs when they are both unmapped anyways, we'll just not run fix mates on them.
|
||||
boolean doNotFixMates = newRead.getReadUnmappedFlag() && mate.getReadUnmappedFlag();
|
||||
if ( !doNotFixMates ) {
|
||||
|
||||
boolean reQueueMate = mate.getReadUnmappedFlag() && ! newRead.getReadUnmappedFlag();
|
||||
if ( reQueueMate ) {
|
||||
// the mate was unmapped, but newRead was mapped, so the mate may have been moved
|
||||
// to be next-to newRead, so needs to be reinserted into the waitingReads queue
|
||||
// note -- this must be called before the setMateInfo call below
|
||||
if ( ! waitingReads.remove(mate) )
|
||||
// we must have hit a region with too much depth and flushed the queue
|
||||
reQueueMate = false;
|
||||
}
|
||||
|
||||
// we've already seen our mate -- set the mate info and remove it from the map
|
||||
SamPairUtil.setMateInfo(mate, newRead, null);
|
||||
if ( reQueueMate ) waitingReads.add(mate);
|
||||
}
|
||||
|
||||
forMateMatching.remove(newRead.getReadName());
|
||||
} else {
|
||||
forMateMatching.put(newRead.getReadName(), newRead);
|
||||
}
|
||||
}
|
||||
|
||||
waitingReads.add(newRead);
|
||||
|
||||
if ( ++counter % EMIT_FREQUENCY == 0 ) {
|
||||
while ( ! waitingReads.isEmpty() ) { // there's something in the queue
|
||||
SAMRecord read = waitingReads.peek();
|
||||
|
||||
if ( noReadCanMoveBefore(read.getAlignmentStart(), newRead) &&
|
||||
(iSizeTooBigToMove(read) // we won't try to move such a read
|
||||
|| ! read.getReadPairedFlag() // we're not a paired read
|
||||
|| read.getReadUnmappedFlag() && read.getMateUnmappedFlag() // both reads are unmapped
|
||||
|| noReadCanMoveBefore(read.getMateAlignmentStart(), newRead ) ) ) { // we're already past where the mate started
|
||||
|
||||
// remove reads from the map that we have emitted -- useful for case where the mate never showed up
|
||||
forMateMatching.remove(read.getReadName());
|
||||
|
||||
if ( DEBUG )
|
||||
logger.warn(String.format("EMIT! At %d: read %s at %d with isize %d, mate start %d, op = %s",
|
||||
newRead.getAlignmentStart(), read.getReadName(), read.getAlignmentStart(),
|
||||
read.getInferredInsertSize(), read.getMateAlignmentStart(), read.getAttribute("OP")));
|
||||
// emit to disk
|
||||
writeRead(waitingReads.remove());
|
||||
} else {
|
||||
if ( DEBUG )
|
||||
logger.warn(String.format("At %d: read %s at %d with isize %d couldn't be emited, mate start %d",
|
||||
newRead.getAlignmentStart(), read.getReadName(), read.getAlignmentStart(), read.getInferredInsertSize(), read.getMateAlignmentStart()));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ( DEBUG ) logger.warn(String.format("At %d: Done with emit cycle", newRead.getAlignmentStart()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param read the read
|
||||
* @return true if the read shouldn't be moved given the constraints of this SAMFileWriter
|
||||
*/
|
||||
public boolean iSizeTooBigToMove(SAMRecord read) {
|
||||
return iSizeTooBigToMove(read, maxInsertSizeForMovingReadPairs); // we won't try to move such a read
|
||||
}
|
||||
|
||||
public static boolean iSizeTooBigToMove(SAMRecord read, int maxInsertSizeForMovingReadPairs) {
|
||||
return ( read.getReadPairedFlag() && ! read.getMateUnmappedFlag() && read.getReferenceName() != read.getMateReferenceName() ) // maps to different chromosomes
|
||||
|| Math.abs(read.getInferredInsertSize()) > maxInsertSizeForMovingReadPairs; // we won't try to move such a read
|
||||
}
|
||||
|
||||
public void close() {
|
||||
// write out all of the remaining reads
|
||||
while ( ! waitingReads.isEmpty() ) { // there's something in the queue
|
||||
writeRead(waitingReads.remove());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -50,7 +50,6 @@ import org.broadinstitute.sting.utils.baq.BAQ;
|
|||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator;
|
||||
import org.broadinstitute.sting.utils.interval.NwayIntervalMergingIterator;
|
||||
import org.broadinstitute.sting.utils.sam.ConstrainedMateFixingSAMFileWriter;
|
||||
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
|
|
@ -85,8 +84,9 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
@Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false)
|
||||
protected double MISMATCH_THRESHOLD = 0.15;
|
||||
|
||||
@Output(required=false, doc="Output bam")
|
||||
@Output(required=true, doc="Output bam")
|
||||
protected StingSAMFileWriter writer = null;
|
||||
protected ConstrainedMateFixingManager manager = null;
|
||||
|
||||
@Argument(fullName="useOnlyKnownIndels", shortName="knownsOnly", required=false, doc="Don't run 'Smith-Waterman' to generate alternate consenses; use only known indels provided as RODs for constructing the alternate references.")
|
||||
protected boolean USE_KNOWN_INDELS_ONLY = false;
|
||||
|
|
@ -98,6 +98,10 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
|
||||
// ADVANCED OPTIONS FOLLOW
|
||||
|
||||
@Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+
|
||||
"Keep it low to minimize memory consumption (but the tool may skip realignment on regions with too much coverage. If it is too low, it may generate errors during realignment); keep it high to maximize realignment (but make sure to give Java enough memory).", required=false)
|
||||
protected int MAX_RECORDS_IN_MEMORY = 200000;
|
||||
|
||||
@Argument(fullName="maxIsizeForMovement", shortName="maxIsize", doc="maximum insert size of read pairs that we attempt to realign", required=false)
|
||||
protected int MAX_ISIZE_FOR_MOVEMENT = 3000;
|
||||
|
||||
|
|
@ -149,10 +153,6 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
|
||||
// DEPRECATED
|
||||
|
||||
@Deprecated
|
||||
@Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="This argument is no longer used.", required=false)
|
||||
protected int DEPRECATED_MAX_IN_RAM = 0;
|
||||
|
||||
@Deprecated
|
||||
@Argument(fullName="sortInCoordinateOrderEvenThoughItIsHighlyUnsafe", doc="This argument is no longer used.", required=false)
|
||||
protected boolean DEPRECATED_SORT_IN_COORDINATE_ORDER = false;
|
||||
|
|
@ -210,12 +210,13 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
private FileWriter statsOutput = null;
|
||||
private FileWriter snpsOutput = null;
|
||||
|
||||
protected Map<SAMReaderID,SAMFileWriter> nwayWriters = null;
|
||||
protected Map<SAMReaderID, ConstrainedMateFixingManager> nwayWriters = null;
|
||||
|
||||
// debug info for lazy SW evaluation:
|
||||
private long exactMatchesFound = 0; // how many reads exactly matched a consensus we already had
|
||||
private long SWalignmentRuns = 0; // how many times (=for how many reads) we ran SW alignment
|
||||
private long SWalignmentSuccess = 0; // how many SW alignments were "successful" (i.e. found a workable indel and resulted in non-null consensus)
|
||||
|
||||
public void initialize() {
|
||||
|
||||
if ( LOD_THRESHOLD < 0.0 )
|
||||
|
|
@ -255,9 +256,9 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
|
||||
if ( N_WAY_OUT != null ) {
|
||||
|
||||
if ( writer != null ) throw new UserException.BadInput("-nWayOut and -o arguments are mutually exclusive");
|
||||
//if ( writer != null ) throw new UserException.BadInput("-nWayOut and -o arguments are mutually exclusive");
|
||||
|
||||
nwayWriters = new HashMap<SAMReaderID,SAMFileWriter>();
|
||||
nwayWriters = new HashMap<SAMReaderID, ConstrainedMateFixingManager>();
|
||||
|
||||
Map<String,String> fname_map = null;
|
||||
|
||||
|
|
@ -316,15 +317,15 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
File f = new File(outName);
|
||||
SAMFileHeader header = getToolkit().getSAMFileHeader(rid);
|
||||
header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
|
||||
SAMFileWriter sw = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, f);
|
||||
nwayWriters.put(rid,sw);
|
||||
SAMFileWriter sw = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, true, f);
|
||||
nwayWriters.put(rid, new ConstrainedMateFixingManager(sw, getToolkit().getGenomeLocParser(), MAX_ISIZE_FOR_MOVEMENT, MAX_POS_MOVE_ALLOWED, MAX_RECORDS_IN_MEMORY));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// set up the output writer(s)
|
||||
if ( writer != null )
|
||||
setupWriter(getToolkit().getSAMFileHeader());
|
||||
// set up the output writer
|
||||
setupWriter(getToolkit().getSAMFileHeader());
|
||||
manager = new ConstrainedMateFixingManager(writer, getToolkit().getGenomeLocParser(), MAX_ISIZE_FOR_MOVEMENT, MAX_POS_MOVE_ALLOWED, MAX_RECORDS_IN_MEMORY);
|
||||
|
||||
if ( OUT_INDELS != null ) {
|
||||
try {
|
||||
|
|
@ -378,41 +379,32 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
|
||||
writer.writeHeader(header);
|
||||
writer.setPresorted(true);
|
||||
|
||||
writer.setUseConstrainedFileWriter(true);
|
||||
writer.setMaxInsertSizeForMovingReadPairs(MAX_ISIZE_FOR_MOVEMENT);
|
||||
writer.setMaxPositionalMoveAllowed(MAX_POS_MOVE_ALLOWED);
|
||||
}
|
||||
|
||||
private void emit(final SAMRecord read) {
|
||||
try {
|
||||
if ( writer != null )
|
||||
writer.addAlignment(read);
|
||||
else if ( N_WAY_OUT != null ) {
|
||||
if ( N_WAY_OUT != null ) {
|
||||
SAMReaderID rid = getToolkit().getReaderIDForRead(read);
|
||||
SAMFileWriter w = nwayWriters.get(rid);
|
||||
ConstrainedMateFixingManager m = nwayWriters.get(rid);
|
||||
// reset read's read group from merged to original if read group id collision has happened in merging:
|
||||
if ( getToolkit().getReadsDataSource().hasReadGroupCollisions() ) {
|
||||
read.setAttribute("RG",
|
||||
getToolkit().getReadsDataSource().getOriginalReadGroupId((String)read.getAttribute("RG")));
|
||||
}
|
||||
w.addAlignment(read);
|
||||
m.addRead(read);
|
||||
} else {
|
||||
manager.addRead(read);
|
||||
}
|
||||
} catch (RuntimeIOException e) {
|
||||
throw new UserException.ErrorWritingBamFile(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
private void emit(final Collection<SAMRecord> reads) {
|
||||
for ( SAMRecord read : reads )
|
||||
emit(read);
|
||||
}
|
||||
|
||||
private void emitReadLists() {
|
||||
// pre-merge lists with priority queue for constrained SAMFileWriter
|
||||
//logger.warn("EMIT currentInterval " + currentInterval);
|
||||
readsNotToClean.addAll(readsToClean.getReads());
|
||||
emit(ReadUtils.coordinateSortReads(readsNotToClean));
|
||||
for ( SAMRecord read : ReadUtils.coordinateSortReads(readsNotToClean) )
|
||||
emit(read);
|
||||
readsToClean.clear();
|
||||
readsNotToClean.clear();
|
||||
}
|
||||
|
|
@ -475,11 +467,15 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
read.getReadFailsVendorQualityCheckFlag() ||
|
||||
read.getMappingQuality() == 0 ||
|
||||
read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ||
|
||||
ConstrainedMateFixingSAMFileWriter.iSizeTooBigToMove(read, MAX_ISIZE_FOR_MOVEMENT);
|
||||
ConstrainedMateFixingManager.iSizeTooBigToMove(read, MAX_ISIZE_FOR_MOVEMENT);
|
||||
}
|
||||
|
||||
private void cleanAndCallMap(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker, GenomeLoc readLoc) {
|
||||
clean(readsToClean);
|
||||
if ( readsToClean.size() > 0 ) {
|
||||
GenomeLoc earliestPossibleMove = getToolkit().getGenomeLocParser().createGenomeLoc(readsToClean.getReads().get(0));
|
||||
if ( manager.canMoveReads(earliestPossibleMove) )
|
||||
clean(readsToClean);
|
||||
}
|
||||
knownIndelsToTry.clear();
|
||||
indelRodsSeen.clear();
|
||||
|
||||
|
|
@ -507,14 +503,18 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
if ( readsToClean.size() > 0 || readsNotToClean.size() > 0 ) {
|
||||
clean(readsToClean);
|
||||
knownIndelsToTry.clear();
|
||||
indelRodsSeen.clear();
|
||||
|
||||
if ( readsToClean.size() > 0 ) {
|
||||
GenomeLoc earliestPossibleMove = getToolkit().getGenomeLocParser().createGenomeLoc(readsToClean.getReads().get(0));
|
||||
if ( manager.canMoveReads(earliestPossibleMove) )
|
||||
clean(readsToClean);
|
||||
emitReadLists();
|
||||
} else if ( readsNotToClean.size() > 0 ) {
|
||||
emitReadLists();
|
||||
}
|
||||
|
||||
knownIndelsToTry.clear();
|
||||
indelRodsSeen.clear();
|
||||
|
||||
if ( OUT_INDELS != null ) {
|
||||
try {
|
||||
indelOutput.close();
|
||||
|
|
@ -539,12 +539,15 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
|
||||
if ( N_WAY_OUT != null ) {
|
||||
try {
|
||||
for ( SAMFileWriter w : nwayWriters.values() )
|
||||
w.close();
|
||||
for ( ConstrainedMateFixingManager m : nwayWriters.values() )
|
||||
m.close();
|
||||
} catch (RuntimeIOException e) {
|
||||
throw new UserException.ErrorWritingBamFile(e.getMessage());
|
||||
}
|
||||
} else {
|
||||
manager.close();
|
||||
}
|
||||
|
||||
if ( CHECKEARLY ) {
|
||||
logger.info("SW alignments runs: "+SWalignmentRuns);
|
||||
logger.info("SW alignments successfull: "+SWalignmentSuccess + " ("+SWalignmentSuccess/SWalignmentRuns+"% of SW runs)");
|
||||
|
|
@ -601,7 +604,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
return;
|
||||
|
||||
byte[] reference = readsToClean.getReference(referenceReader);
|
||||
int leftmostIndex = (int)readsToClean.getLocation().getStart();
|
||||
int leftmostIndex = readsToClean.getLocation().getStart();
|
||||
|
||||
final ArrayList<SAMRecord> refReads = new ArrayList<SAMRecord>(); // reads that perfectly match ref
|
||||
final ArrayList<AlignedRead> altReads = new ArrayList<AlignedRead>(); // reads that don't perfectly match
|
||||
|
|
|
|||
|
|
@ -29,9 +29,7 @@ import net.sf.samtools.CigarOperator;
|
|||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.*;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
|
@ -48,58 +46,10 @@ public class AlignmentUtils {
|
|||
public long mismatchQualities = 0;
|
||||
}
|
||||
|
||||
/** Returns number of mismatches in the alignment <code>r</code> to the reference sequence
|
||||
* <code>refSeq</code> assuming the alignment starts at (ZERO-based) position <code>refIndex</code> on the
|
||||
* specified reference sequence; in other words, <code>refIndex</code> is used in place of alignment's own
|
||||
* getAlignmentStart() coordinate and the latter is never used. However, the structure of the alignment <code>r</code>
|
||||
* (i.e. it's cigar string with all the insertions/deletions it may specify) is fully respected.
|
||||
*
|
||||
* THIS CODE ASSUMES THAT ALL BYTES COME FROM UPPERCASED CHARS.
|
||||
*
|
||||
* @param r alignment
|
||||
* @param refSeq chunk of reference sequence that subsumes the alignment completely (if alignment runs out of
|
||||
* the reference string, IndexOutOfBound exception will be thrown at runtime).
|
||||
* @param refIndex zero-based position, at which the alignment starts on the specified reference string.
|
||||
* @return the number of mismatches
|
||||
*/
|
||||
public static int numMismatches(SAMRecord r, byte[] refSeq, int refIndex) {
|
||||
return getMismatchCount(r, refSeq, refIndex).numMismatches;
|
||||
}
|
||||
|
||||
/** Same as #numMismatches(SAMRecord, byte[], refIndex), but counts mismatches only along the partial stretch
|
||||
* on the read of length <code>nReadBases</code> starting at (0-based) position <code>readIndex</code>.
|
||||
* @param r Aligned read to count mismatches for
|
||||
* @param refSeq Chunk of reference sequence that subsumes the alignment
|
||||
* @param refIndex Zero-based position on <code>refSeq</code> where the alignment for the whole read starts
|
||||
* @param readIndex Zero-based position on the read, the mismatches will be counted only from this position on
|
||||
* @param nReadBases Length of continuous stretch on the read, along which mismatches will be counted
|
||||
* @return
|
||||
*/
|
||||
public static int numMismatches(SAMRecord r, byte[] refSeq, int refIndex, int readIndex, int nReadBases) {
|
||||
if ( r.getReadUnmappedFlag() ) return 1000000;
|
||||
return getMismatchCount(r, refSeq, refIndex,readIndex,nReadBases).numMismatches;
|
||||
}
|
||||
|
||||
public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex, int readIndex, int nReadBases) {
|
||||
return getMismatchCount(r, refSeq, refIndex,readIndex,nReadBases).mismatchQualities;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public static int numMismatches(SAMRecord r, String refSeq, int refIndex ) {
|
||||
if ( r.getReadUnmappedFlag() ) return 1000000;
|
||||
return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex);
|
||||
}
|
||||
|
||||
public static long mismatchingQualities(SAMRecord r, byte[] refSeq, int refIndex) {
|
||||
return getMismatchCount(r, refSeq, refIndex).mismatchQualities;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public static long mismatchingQualities(SAMRecord r, String refSeq, int refIndex ) {
|
||||
if ( r.getReadUnmappedFlag() ) return 1000000;
|
||||
return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex);
|
||||
}
|
||||
|
||||
public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) {
|
||||
return getMismatchCount(r,refSeq,refIndex,0,r.getReadLength());
|
||||
}
|
||||
|
|
@ -191,8 +141,8 @@ public class AlignmentUtils {
|
|||
public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite, boolean qualitySumInsteadOfMismatchCount) {
|
||||
int sum = 0;
|
||||
|
||||
int windowStart = (int)ref.getWindow().getStart();
|
||||
int windowStop = (int)ref.getWindow().getStop();
|
||||
int windowStart = ref.getWindow().getStart();
|
||||
int windowStop = ref.getWindow().getStop();
|
||||
byte[] refBases = ref.getBases();
|
||||
byte[] readBases = p.getRead().getReadBases();
|
||||
byte[] readQualities = p.getRead().getBaseQualities();
|
||||
|
|
@ -262,11 +212,11 @@ public class AlignmentUtils {
|
|||
// it's possible we aren't starting at the beginning of a read,
|
||||
// and we don't need to look at any of the previous context outside our window
|
||||
// (although we do need future context)
|
||||
int readStartPos = Math.max(read.getAlignmentStart(), (int)ref.getLocus().getStart() - windowSize);
|
||||
int readStartPos = Math.max(read.getAlignmentStart(), ref.getLocus().getStart() - windowSize);
|
||||
int currentReadPos = read.getAlignmentStart();
|
||||
|
||||
byte[] refBases = ref.getBases();
|
||||
int refIndex = readStartPos - (int)ref.getWindow().getStart();
|
||||
int refIndex = readStartPos - ref.getWindow().getStart();
|
||||
if ( refIndex < 0 ) {
|
||||
throw new IllegalStateException("When calculating mismatches, we somehow don't have enough previous reference context for read " + read.getReadName() + " at position " + ref.getLocus());
|
||||
}
|
||||
|
|
@ -381,46 +331,6 @@ public class AlignmentUtils {
|
|||
return n;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public static char[] alignmentToCharArray( final Cigar cigar, final char[] read, final char[] ref ) {
|
||||
|
||||
final char[] alignment = new char[read.length];
|
||||
int refPos = 0;
|
||||
int alignPos = 0;
|
||||
|
||||
for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
|
||||
|
||||
final CigarElement ce = cigar.getCigarElement(iii);
|
||||
final int elementLength = ce.getLength();
|
||||
|
||||
switch( ce.getOperator() ) {
|
||||
case I:
|
||||
case S:
|
||||
for ( int jjj = 0 ; jjj < elementLength; jjj++ ) {
|
||||
alignment[alignPos++] = '+';
|
||||
}
|
||||
break;
|
||||
case D:
|
||||
case N:
|
||||
refPos++;
|
||||
break;
|
||||
case M:
|
||||
for ( int jjj = 0 ; jjj < elementLength; jjj++ ) {
|
||||
alignment[alignPos] = ref[refPos];
|
||||
alignPos++;
|
||||
refPos++;
|
||||
}
|
||||
break;
|
||||
case H:
|
||||
case P:
|
||||
break;
|
||||
default:
|
||||
throw new ReviewedStingException( "Unsupported cigar operator: " + ce.getOperator() );
|
||||
}
|
||||
}
|
||||
return alignment;
|
||||
}
|
||||
|
||||
public static byte[] alignmentToByteArray( final Cigar cigar, final byte[] read, final byte[] ref ) {
|
||||
|
||||
final byte[] alignment = new byte[read.length];
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.indels;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
|
@ -101,4 +102,28 @@ public class IndelRealignerIntegrationTest extends WalkerTest {
|
|||
Arrays.asList("be859f9a98d738becee0526887cae42e"));
|
||||
executeTest("realigner long run", spec);
|
||||
}
|
||||
|
||||
@Test(dependsOnMethods = { "testStats" })
|
||||
public void testMaxReadsInMemory() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "--maxReadsInMemory 10000", "424271ca000442e8b338c67d95dadb82" );
|
||||
e.put( "--maxReadsInMemory 40000", base_md5 );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + entry.getKey(),
|
||||
1,
|
||||
Arrays.asList(entry.getValue()));
|
||||
executeTest(String.format("realigner [%s]", entry.getKey()), spec);
|
||||
}
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = { RuntimeException.class }, dependsOnMethods = { "testMaxReadsInMemory" })
|
||||
public void testFailure() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + "--maxReadsInMemory 1000",
|
||||
1,
|
||||
Arrays.asList(""));
|
||||
executeTest("realigner failure", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ public class IndelRealignerPerformanceTest extends WalkerTest {
|
|||
" -maxConsensuses 100" +
|
||||
" -greedy 100" +
|
||||
" -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" +
|
||||
" -o /dev/null" +
|
||||
" -I " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.bam" +
|
||||
" -L chr1:1-5,650,000" +
|
||||
" -targetIntervals " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.realigner.intervals",
|
||||
|
|
@ -45,6 +46,7 @@ public class IndelRealignerPerformanceTest extends WalkerTest {
|
|||
" -maxConsensuses 100" +
|
||||
" -greedy 100" +
|
||||
" -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" +
|
||||
" -o /dev/null" +
|
||||
" -I " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.bam" +
|
||||
" -L chr1:1-150,000,000" +
|
||||
" -targetIntervals " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.realigner.intervals",
|
||||
|
|
|
|||
Loading…
Reference in New Issue