diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index bfb6781fe..877bfec9f 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -29,12 +29,11 @@ import net.sf.samtools.*; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import java.util.*; @@ -54,6 +53,7 @@ public class LocusIteratorByState extends LocusIterator { // // ----------------------------------------------------------------------------------------------------------------- private final PushbackIterator it; + private boolean hasExtendedEvents = false; // will be set to true if at least one read had an indel right before the current position private class SAMRecordState { SAMRecord read; @@ -68,14 +68,40 @@ public class LocusIteratorByState extends LocusIterator { // how far are we into a single cigarElement int cigarElementCounter = -1; - public SAMRecordState(SAMRecord read) { + // The logical model for generating extended events is as follows: the "record state" implements the traversal + // along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This + // can be a (mis)match or a deletion (in the latter case, we still return on every individual reference base the + // deletion spans). In the extended events mode, the record state also remembers if there was an insertion, or + // if the deletion just started *right before* the current reference base the record state is pointing to upon the return from + // stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended + // events immediately preceding the current reference base). + + boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases? + // the only purpose of this flag is to shield away a few additional lines of code + // when extended piles are not needed, it may not be even worth it... + byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels) + int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events + byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the + // current base on the ref. We use a counter-like variable here since clearing the indel event is + // delayed by one base, so we need to remember how long ago we have seen the actual event + int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the + // event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly, + // we cache it here mainly for convenience + + + public SAMRecordState(SAMRecord read, boolean extended) { this.read = read; cigar = read.getCigar(); nCigarElements = cigar.numCigarElements(); + generateExtendedEvents = extended; //System.out.printf("Creating a SAMRecordState: %s%n", this); } + public SAMRecordState(SAMRecord read) { + this(read,false); + } + public SAMRecord getRead() { return read; } /** @@ -102,11 +128,29 @@ public class LocusIteratorByState extends LocusIterator { return curElement.getOperator(); } - public String toString() { + /** Returns true if we just stepped over insertion/into a deletion prior to the last return from stepForwardOnGenome. + * + * @return + */ + public boolean hadIndel() { + return ( eventLength > 0 ); + } + + public int getEventLength() { return eventLength; } + + public byte[] getEventBases() { return insertedBases; } + + public int getReadEventStartOffset() { return eventStart; } + + public String toString() { return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement); } public CigarOperator stepForwardOnGenome() { + // we enter this method with readOffset = index of the last processed base on the read + // (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion + + if ( curElement == null || ++cigarElementCounter > curElement.getLength() ) { cigarOffset++; if ( cigarOffset < nCigarElements ) { @@ -116,23 +160,48 @@ public class LocusIteratorByState extends LocusIterator { // we reenter in order to re-check cigarElementCounter against curElement's length return stepForwardOnGenome(); } else { + genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here: + // we do step forward on the ref, and by returning null we also indicate that we are past the read end. return null; } } + boolean done = false; switch (curElement.getOperator()) { case H : // ignore hard clips case P : // ignore pads cigarElementCounter = curElement.getLength(); break; - case S : // soft clip case I : // insertion w.r.t. the reference + if ( generateExtendedEvents ) { + // we see insertions only once, when we step right onto them; the position on the read is scrolled + // past the insertion right after that + if ( eventDelayedFlag > 1 ) throw new StingException("Adjacent I/D events in read "+read.getReadName()); + insertedBases = Arrays.copyOfRange(read.getReadBases(),readOffset+1,readOffset+1+curElement.getLength()); + eventLength = cigarElementCounter ; + eventStart = readOffset; + eventDelayedFlag = 2; // insertion causes re-entry into stepForwardOnGenome, so we set the delay to 2 +// System.out.println("Inserted "+(new String (insertedBases)) +" after "+readOffset); + } // continue onto the 'S' case ! + case S : // soft clip cigarElementCounter = curElement.getLength(); readOffset += curElement.getLength(); break; - case N : // reference skip (looks and gets processed just like a "deletion", just different logical meaning) case D : // deletion w.r.t. the reference + if ( generateExtendedEvents ) { + if ( cigarElementCounter == 1) { + // generate an extended event only if we just stepped into the deletion (i.e. don't + // generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!) + if ( eventDelayedFlag > 1 ) throw new StingException("Adjacent I/D events in read "+read.getReadName()); + eventLength = curElement.getLength(); + eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only + eventStart = readOffset; + insertedBases = null; +// System.out.println("Deleted "+eventLength +" bases after "+readOffset); + } + } // continue onto the 'N' case ! + case N : // reference skip (looks and gets processed just like a "deletion", just different logical meaning) genomeOffset++; done = true; break; @@ -144,6 +213,21 @@ public class LocusIteratorByState extends LocusIterator { default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator()); } + if ( generateExtendedEvents ) { + if ( eventDelayedFlag > 0 && done ) { + // if we did make a successful step on the ref, decrement delayed flag. If, upon the decrementthe, + // the flag is 1, we are standing on the reference base right after the indel (so we have to keep it). + // Otherwise, we are away from the previous indel and have to clear our memories... + eventDelayedFlag--; // when we notice an indel, we set delayed flag to 2, so now + // if eventDelayedFlag == 1, an indel occured right before the current base + if ( eventDelayedFlag == 0 ) { + eventLength = -1; // reset event when we are past it + insertedBases = null; + eventStart = -1; + } + } + } + return done ? curElement.getOperator() : stepForwardOnGenome(); } } @@ -203,36 +287,96 @@ public class LocusIteratorByState extends LocusIterator { public AlignmentContext next() { // keep iterating forward until we encounter a reference position that has something "real" hanging over it // (i.e. either a real base, or a real base or a deletion if includeReadsWithDeletion is true) + + while(true) { - ArrayList pile = new ArrayList(readStates.size()); + + // this call will set hasExtendedEvents to true if it picks up a read with indel right before the current position on the ref: collectPendingReads(readInfo.getMaxReadsAtLocus()); int size = 0; int nDeletions = 0; + int nInsertions = 0; int nMQ0Reads = 0; - // todo -- performance problem -- should be lazy, really - for ( SAMRecordState state : readStates ) { - if ( state.getCurrentCigarOperator() != CigarOperator.D && state.getCurrentCigarOperator() != CigarOperator.N ) { - size++; - PileupElement p = new PileupElement(state.getRead(), state.getReadOffset()); - pile.add(p); - } else if ( readInfo.includeReadsWithDeletionAtLoci() && state.getCurrentCigarOperator() != CigarOperator.N ) { - size++; - pile.add(new PileupElement(state.getRead(), -1)); - nDeletions++; - } - if ( state.getRead().getMappingQuality() == 0 ) { - nMQ0Reads++; + // if extended events are requested, and if previous traversal step brought us over an indel in + // at least one read, we emit extended pileup (making sure that it is associated with the previous base, + // i.e. the one right *before* the indel) and do NOT shift the current position on the ref. + // In this case, the subsequent call to next() will emit the normal pileup at the current base + // and shift the position. + if ( readInfo.generateExtendedEvents() && hasExtendedEvents ) { + ArrayList indelPile = new ArrayList(readStates.size()); + + for ( SAMRecordState state : readStates ) { + if ( state.hadIndel() ) { + size++; + if ( state.getEventBases() == null ) nDeletions++; + else nInsertions++; + indelPile.add ( new ExtendedEventPileupElement(state.getRead(), + state.getReadEventStartOffset(), + state.getEventLength(), + state.getEventBases()) ); + } else { + if ( state.getCurrentCigarOperator() != CigarOperator.N ) { + // this read has no indel associated with the previous position on the ref; + // we count this read in only if it has actual bases, not N span... + if ( state.getCurrentCigarOperator() != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci() ) { + + // if cigar operator is D but the read has no extended event reported (that's why we ended + // up in this branch), it means that we are currently inside a deletion that started earlier; + // we count such reads (with a longer deletion spanning over a deletion at the previous base we are + // about to report) only if includeReadsWithDeletionAtLoci is true. + size++; + indelPile.add ( new ExtendedEventPileupElement(state.getRead(), + state.getReadOffset()-1, + -1) // length=-1 --> noevent + ); + } + } + } + if ( state.getRead().getMappingQuality() == 0 ) { + nMQ0Reads++; + } } + hasExtendedEvents = false; // we are done with extended events prior to current ref base + SAMRecordState our1stState = readStates.getFirst(); + // get current location on the reference and decrement it by 1: the indels we just stepped over + // are associated with the *previous* reference base + GenomeLoc loc = GenomeLocParser.incPos(our1stState.getLocation(),-1); +// System.out.println("Indel(s) at "+loc); +// for ( ExtendedEventPileupElement pe : indelPile ) { if ( pe.isIndel() ) System.out.println(" "+pe.toString()); } + return new AlignmentContext(loc, new ReadBackedExtendedEventPileup(loc, indelPile, size, nInsertions, nDeletions, nMQ0Reads)); + } else { + + ArrayList pile = new ArrayList(readStates.size()); + + + // todo -- performance problem -- should be lazy, really + for ( SAMRecordState state : readStates ) { + if ( state.getCurrentCigarOperator() != CigarOperator.D && state.getCurrentCigarOperator() != CigarOperator.N ) { + size++; + PileupElement p = new PileupElement(state.getRead(), state.getReadOffset()); + pile.add(p); + } else if ( readInfo.includeReadsWithDeletionAtLoci() && state.getCurrentCigarOperator() != CigarOperator.N ) { + size++; + pile.add(new PileupElement(state.getRead(), -1)); + nDeletions++; + } + + if ( state.getRead().getMappingQuality() == 0 ) { + nMQ0Reads++; + } + +// if ( state.hadIndel() ) System.out.println("Indel at "+getLocation()+" in read "+state.getRead().getReadName()) ; + + } + GenomeLoc loc = getLocation(); + updateReadStates(); // critical - must be called after we get the current state offsets and location + // if we got reads with non-D/N over the current position, we are done + if ( pile.size() != 0 ) return new AlignmentContext(loc, new ReadBackedPileup(loc, pile, size, nDeletions, nMQ0Reads)); } - GenomeLoc loc = getLocation(); - updateReadStates(); // critical - must be called after we get the current state offsets and location - - // if we got reads with non-D/N over the current position, we are done - if ( pile.size() != 0 ) return new AlignmentContext(loc, new ReadBackedPileup(loc, pile, size, nDeletions, nMQ0Reads)); } } @@ -300,10 +444,11 @@ public class LocusIteratorByState extends LocusIterator { Utils.warnUser("Unable to add a read, we're over the hanger limit of " + maximumPileupSize + " at location " + getLocation()); } } else { - SAMRecordState state = new SAMRecordState(read); + SAMRecordState state = new SAMRecordState(read, readInfo.generateExtendedEvents()); state.stepForwardOnGenome(); readStates.add(state); curSize++; + if ( state.hadIndel() ) hasExtendedEvents = true; //if (DEBUG) logger.debug(String.format(" ... added read %s", read.getReadName())); } } @@ -317,6 +462,13 @@ public class LocusIteratorByState extends LocusIterator { else { SAMRecordState state = readStates.getFirst(); SAMRecord ourRead = state.getRead(); +// int offset = 0; +// final CigarElement ce = read.getCigar().getCigarElement(0); + // if read starts with an insertion, we want to get it in at the moment we are standing on the + // reference base the insertion is associated with, not when we reach "alignment start", which is + // first base *after* the insertion +// if ( ce.getOperator() == CigarOperator.I ) offset = ce.getLength(); +// return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() - offset > state.getGenomePosition(); return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition(); } } @@ -327,9 +479,15 @@ public class LocusIteratorByState extends LocusIterator { while ( it.hasNext() ) { SAMRecordState state = it.next(); CigarOperator op = state.stepForwardOnGenome(); - if ( op == null ) { // we've stepped off the end of the object - //if (DEBUG) logger.debug(String.format(" removing read %s at %d", state.getRead().getReadName(), state.getRead().getAlignmentStart())); - it.remove(); + if ( state.hadIndel() && readInfo.generateExtendedEvents() ) hasExtendedEvents = true; + else { + // we discard the read only when we are past its end AND indel at the end of the read (if any) was + // already processed. Keeping the read state that retunred null upon stepForwardOnGenome() is safe + // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. + if ( op == null ) { // we've stepped off the end of the object + //if (DEBUG) logger.debug(String.format(" removing read %s at %d", state.getRead().getReadName(), state.getRead().getAlignmentStart())); + it.remove(); + } } } }