Modified to enable locus traversals firing additional calls to walker's map() with alignment context filled with extended events (indels). Walker should override generateExtendedEvents() to return true, and it should make sure that it catches those additional indel pileups and processes them differently, as needed. If there are indels associated with a specific reference base, TWO map() calls will be issued in locus traversal at that location: first one will have a context filled with a regular base pileup, the second call will provide the context filled with indel pileup (pileup elements will have insertion, deletion, or noevent type associated with them and will also carry information about the full length of the event and inserted bases).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2471 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-12-29 20:13:25 +00:00
parent 06eb576924
commit 9652692019
1 changed files with 189 additions and 31 deletions

View File

@ -29,12 +29,11 @@ import net.sf.samtools.*;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import java.util.*;
@ -54,6 +53,7 @@ public class LocusIteratorByState extends LocusIterator {
//
// -----------------------------------------------------------------------------------------------------------------
private final PushbackIterator<SAMRecord> it;
private boolean hasExtendedEvents = false; // will be set to true if at least one read had an indel right before the current position
private class SAMRecordState {
SAMRecord read;
@ -68,14 +68,40 @@ public class LocusIteratorByState extends LocusIterator {
// how far are we into a single cigarElement
int cigarElementCounter = -1;
public SAMRecordState(SAMRecord read) {
// The logical model for generating extended events is as follows: the "record state" implements the traversal
// along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This
// can be a (mis)match or a deletion (in the latter case, we still return on every individual reference base the
// deletion spans). In the extended events mode, the record state also remembers if there was an insertion, or
// if the deletion just started *right before* the current reference base the record state is pointing to upon the return from
// stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended
// events immediately preceding the current reference base).
boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases?
// the only purpose of this flag is to shield away a few additional lines of code
// when extended piles are not needed, it may not be even worth it...
byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels)
int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events
byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the
// current base on the ref. We use a counter-like variable here since clearing the indel event is
// delayed by one base, so we need to remember how long ago we have seen the actual event
int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the
// event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly,
// we cache it here mainly for convenience
public SAMRecordState(SAMRecord read, boolean extended) {
this.read = read;
cigar = read.getCigar();
nCigarElements = cigar.numCigarElements();
generateExtendedEvents = extended;
//System.out.printf("Creating a SAMRecordState: %s%n", this);
}
public SAMRecordState(SAMRecord read) {
this(read,false);
}
public SAMRecord getRead() { return read; }
/**
@ -102,11 +128,29 @@ public class LocusIteratorByState extends LocusIterator {
return curElement.getOperator();
}
public String toString() {
/** Returns true if we just stepped over insertion/into a deletion prior to the last return from stepForwardOnGenome.
*
* @return
*/
public boolean hadIndel() {
return ( eventLength > 0 );
}
public int getEventLength() { return eventLength; }
public byte[] getEventBases() { return insertedBases; }
public int getReadEventStartOffset() { return eventStart; }
public String toString() {
return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement);
}
public CigarOperator stepForwardOnGenome() {
// we enter this method with readOffset = index of the last processed base on the read
// (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion
if ( curElement == null || ++cigarElementCounter > curElement.getLength() ) {
cigarOffset++;
if ( cigarOffset < nCigarElements ) {
@ -116,23 +160,48 @@ public class LocusIteratorByState extends LocusIterator {
// we reenter in order to re-check cigarElementCounter against curElement's length
return stepForwardOnGenome();
} else {
genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:
// we do step forward on the ref, and by returning null we also indicate that we are past the read end.
return null;
}
}
boolean done = false;
switch (curElement.getOperator()) {
case H : // ignore hard clips
case P : // ignore pads
cigarElementCounter = curElement.getLength();
break;
case S : // soft clip
case I : // insertion w.r.t. the reference
if ( generateExtendedEvents ) {
// we see insertions only once, when we step right onto them; the position on the read is scrolled
// past the insertion right after that
if ( eventDelayedFlag > 1 ) throw new StingException("Adjacent I/D events in read "+read.getReadName());
insertedBases = Arrays.copyOfRange(read.getReadBases(),readOffset+1,readOffset+1+curElement.getLength());
eventLength = cigarElementCounter ;
eventStart = readOffset;
eventDelayedFlag = 2; // insertion causes re-entry into stepForwardOnGenome, so we set the delay to 2
// System.out.println("Inserted "+(new String (insertedBases)) +" after "+readOffset);
} // continue onto the 'S' case !
case S : // soft clip
cigarElementCounter = curElement.getLength();
readOffset += curElement.getLength();
break;
case N : // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
case D : // deletion w.r.t. the reference
if ( generateExtendedEvents ) {
if ( cigarElementCounter == 1) {
// generate an extended event only if we just stepped into the deletion (i.e. don't
// generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!)
if ( eventDelayedFlag > 1 ) throw new StingException("Adjacent I/D events in read "+read.getReadName());
eventLength = curElement.getLength();
eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only
eventStart = readOffset;
insertedBases = null;
// System.out.println("Deleted "+eventLength +" bases after "+readOffset);
}
} // continue onto the 'N' case !
case N : // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
genomeOffset++;
done = true;
break;
@ -144,6 +213,21 @@ public class LocusIteratorByState extends LocusIterator {
default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
}
if ( generateExtendedEvents ) {
if ( eventDelayedFlag > 0 && done ) {
// if we did make a successful step on the ref, decrement delayed flag. If, upon the decrementthe,
// the flag is 1, we are standing on the reference base right after the indel (so we have to keep it).
// Otherwise, we are away from the previous indel and have to clear our memories...
eventDelayedFlag--; // when we notice an indel, we set delayed flag to 2, so now
// if eventDelayedFlag == 1, an indel occured right before the current base
if ( eventDelayedFlag == 0 ) {
eventLength = -1; // reset event when we are past it
insertedBases = null;
eventStart = -1;
}
}
}
return done ? curElement.getOperator() : stepForwardOnGenome();
}
}
@ -203,36 +287,96 @@ public class LocusIteratorByState extends LocusIterator {
public AlignmentContext next() {
// keep iterating forward until we encounter a reference position that has something "real" hanging over it
// (i.e. either a real base, or a real base or a deletion if includeReadsWithDeletion is true)
while(true) {
ArrayList<PileupElement> pile = new ArrayList<PileupElement>(readStates.size());
// this call will set hasExtendedEvents to true if it picks up a read with indel right before the current position on the ref:
collectPendingReads(readInfo.getMaxReadsAtLocus());
int size = 0;
int nDeletions = 0;
int nInsertions = 0;
int nMQ0Reads = 0;
// todo -- performance problem -- should be lazy, really
for ( SAMRecordState state : readStates ) {
if ( state.getCurrentCigarOperator() != CigarOperator.D && state.getCurrentCigarOperator() != CigarOperator.N ) {
size++;
PileupElement p = new PileupElement(state.getRead(), state.getReadOffset());
pile.add(p);
} else if ( readInfo.includeReadsWithDeletionAtLoci() && state.getCurrentCigarOperator() != CigarOperator.N ) {
size++;
pile.add(new PileupElement(state.getRead(), -1));
nDeletions++;
}
if ( state.getRead().getMappingQuality() == 0 ) {
nMQ0Reads++;
// if extended events are requested, and if previous traversal step brought us over an indel in
// at least one read, we emit extended pileup (making sure that it is associated with the previous base,
// i.e. the one right *before* the indel) and do NOT shift the current position on the ref.
// In this case, the subsequent call to next() will emit the normal pileup at the current base
// and shift the position.
if ( readInfo.generateExtendedEvents() && hasExtendedEvents ) {
ArrayList<ExtendedEventPileupElement> indelPile = new ArrayList<ExtendedEventPileupElement>(readStates.size());
for ( SAMRecordState state : readStates ) {
if ( state.hadIndel() ) {
size++;
if ( state.getEventBases() == null ) nDeletions++;
else nInsertions++;
indelPile.add ( new ExtendedEventPileupElement(state.getRead(),
state.getReadEventStartOffset(),
state.getEventLength(),
state.getEventBases()) );
} else {
if ( state.getCurrentCigarOperator() != CigarOperator.N ) {
// this read has no indel associated with the previous position on the ref;
// we count this read in only if it has actual bases, not N span...
if ( state.getCurrentCigarOperator() != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci() ) {
// if cigar operator is D but the read has no extended event reported (that's why we ended
// up in this branch), it means that we are currently inside a deletion that started earlier;
// we count such reads (with a longer deletion spanning over a deletion at the previous base we are
// about to report) only if includeReadsWithDeletionAtLoci is true.
size++;
indelPile.add ( new ExtendedEventPileupElement(state.getRead(),
state.getReadOffset()-1,
-1) // length=-1 --> noevent
);
}
}
}
if ( state.getRead().getMappingQuality() == 0 ) {
nMQ0Reads++;
}
}
hasExtendedEvents = false; // we are done with extended events prior to current ref base
SAMRecordState our1stState = readStates.getFirst();
// get current location on the reference and decrement it by 1: the indels we just stepped over
// are associated with the *previous* reference base
GenomeLoc loc = GenomeLocParser.incPos(our1stState.getLocation(),-1);
// System.out.println("Indel(s) at "+loc);
// for ( ExtendedEventPileupElement pe : indelPile ) { if ( pe.isIndel() ) System.out.println(" "+pe.toString()); }
return new AlignmentContext(loc, new ReadBackedExtendedEventPileup(loc, indelPile, size, nInsertions, nDeletions, nMQ0Reads));
} else {
ArrayList<PileupElement> pile = new ArrayList<PileupElement>(readStates.size());
// todo -- performance problem -- should be lazy, really
for ( SAMRecordState state : readStates ) {
if ( state.getCurrentCigarOperator() != CigarOperator.D && state.getCurrentCigarOperator() != CigarOperator.N ) {
size++;
PileupElement p = new PileupElement(state.getRead(), state.getReadOffset());
pile.add(p);
} else if ( readInfo.includeReadsWithDeletionAtLoci() && state.getCurrentCigarOperator() != CigarOperator.N ) {
size++;
pile.add(new PileupElement(state.getRead(), -1));
nDeletions++;
}
if ( state.getRead().getMappingQuality() == 0 ) {
nMQ0Reads++;
}
// if ( state.hadIndel() ) System.out.println("Indel at "+getLocation()+" in read "+state.getRead().getReadName()) ;
}
GenomeLoc loc = getLocation();
updateReadStates(); // critical - must be called after we get the current state offsets and location
// if we got reads with non-D/N over the current position, we are done
if ( pile.size() != 0 ) return new AlignmentContext(loc, new ReadBackedPileup(loc, pile, size, nDeletions, nMQ0Reads));
}
GenomeLoc loc = getLocation();
updateReadStates(); // critical - must be called after we get the current state offsets and location
// if we got reads with non-D/N over the current position, we are done
if ( pile.size() != 0 ) return new AlignmentContext(loc, new ReadBackedPileup(loc, pile, size, nDeletions, nMQ0Reads));
}
}
@ -300,10 +444,11 @@ public class LocusIteratorByState extends LocusIterator {
Utils.warnUser("Unable to add a read, we're over the hanger limit of " + maximumPileupSize + " at location " + getLocation());
}
} else {
SAMRecordState state = new SAMRecordState(read);
SAMRecordState state = new SAMRecordState(read, readInfo.generateExtendedEvents());
state.stepForwardOnGenome();
readStates.add(state);
curSize++;
if ( state.hadIndel() ) hasExtendedEvents = true;
//if (DEBUG) logger.debug(String.format(" ... added read %s", read.getReadName()));
}
}
@ -317,6 +462,13 @@ public class LocusIteratorByState extends LocusIterator {
else {
SAMRecordState state = readStates.getFirst();
SAMRecord ourRead = state.getRead();
// int offset = 0;
// final CigarElement ce = read.getCigar().getCigarElement(0);
// if read starts with an insertion, we want to get it in at the moment we are standing on the
// reference base the insertion is associated with, not when we reach "alignment start", which is
// first base *after* the insertion
// if ( ce.getOperator() == CigarOperator.I ) offset = ce.getLength();
// return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() - offset > state.getGenomePosition();
return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition();
}
}
@ -327,9 +479,15 @@ public class LocusIteratorByState extends LocusIterator {
while ( it.hasNext() ) {
SAMRecordState state = it.next();
CigarOperator op = state.stepForwardOnGenome();
if ( op == null ) { // we've stepped off the end of the object
//if (DEBUG) logger.debug(String.format(" removing read %s at %d", state.getRead().getReadName(), state.getRead().getAlignmentStart()));
it.remove();
if ( state.hadIndel() && readInfo.generateExtendedEvents() ) hasExtendedEvents = true;
else {
// we discard the read only when we are past its end AND indel at the end of the read (if any) was
// already processed. Keeping the read state that retunred null upon stepForwardOnGenome() is safe
// as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
if ( op == null ) { // we've stepped off the end of the object
//if (DEBUG) logger.debug(String.format(" removing read %s at %d", state.getRead().getReadName(), state.getRead().getAlignmentStart()));
it.remove();
}
}
}
}