Pileup element and corresponding container class tweaked for representing pileups of extended events (indels) at a given locus. There's some redundancy with PileupElement and ReadBackedPileup (should we rename them to BasePileupElement and ReadBackedBasePileup?), so that abstracting a basic interface/abstract base from these classes can be considered in the future

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2469 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-12-29 20:03:39 +00:00
parent 87e863b48d
commit f445745c56
2 changed files with 450 additions and 0 deletions

View File

@ -0,0 +1,126 @@
package org.broadinstitute.sting.utils.pileup;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import net.sf.samtools.SAMRecord;
import java.util.Arrays;
import java.util.Collections;
/**
* In the "standard" locus traversal mode,
* the traversal is performed striclty over the reference bases. Thus, only pileups of bases (and hence local events
* such as point mutations) are "seen" at every invocation of the walker's map() function at every (genomic) locus. Deletions
* are seen on the base-by-base basis (i.e. the pileup does keep the information about the current reference base being deleted
* in some reads), but the information about the extended event (deletion length, string of all deleted bases) is not kept.
* The insertions that may be present in some reads are not seen at all in such strict reference traversal mode.
*
* By convention, any extended event (indel) is mapped onto the reference at the last base prior to the event (i.e.
* last base before the insertion or deletion). If the special "extended" traversal mode is turned on and there is
* an indel in at least one read that maps onto the reference position Z, the walker's map function will be called twice:
* first call will be performed in a "standard" mode, with a pileup of bases over the position Z, and then the additional
* call will be made at the same position with a pileup of event/noevent calls, where events are extended and contain
* full information about insertions/deletions. Then the next, "standard", call to map() will be performed at the next
* (covered) reference position. Note that if the extended event at Z was a deletion, the "standard" base pileup at
* Z+1 and following bases may still contain deleted bases. However the fully extended event call will be performed
* only once, at the position where the indel maps (starts).
*
* This class wraps an "extended" event (indel) so that in can be added to a pileup of events at a given location.
*
* Created by IntelliJ IDEA.
* User: asivache
* Date: Dec 21, 2009
* Time: 2:57:55 PM
* To change this template use File | Settings | File Templates.
*/
public class ExtendedEventPileupElement {
public enum Type {
NOEVENT, DELETION, INSERTION
};
private Type type = null;
private int eventLength = -1;
private String eventBases = null; // if it is a deletion, we do not have information about the actual deleted bases
// in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases
private SAMRecord read;
private int offset; // position in the read immediately BEFORE the event
/** Constructor for extended pileup element (indel).
*
* @param read the read, in which the indel is observed
* @param offset position in the read immediately before the indel (can be -1 if read starts with an insertion)
* @param length length of the indel (number of inserted or deleted bases); length <=0 indicates that the read has no indel (NOEVENT)
* @param eventBases inserted bases. null indicates that the event is a deletion; ignored if length<=0 (noevent)
*/
public ExtendedEventPileupElement( SAMRecord read, int offset, int length, byte[] eventBases ) {
this.read = read;
this.offset = offset;
this.eventLength = length;
if ( length <= 0 ) type = Type.NOEVENT;
else {
if ( eventBases != null ) {
this.eventBases = new String(eventBases);
type = Type.INSERTION;
} else {
type = Type.DELETION;
}
}
}
/** Constructor for deletion or noevent calls - does not take event bases as an argument (as those should
* be null or are ignored in these cases anyway)
* @param read
* @param offset
* @param length
*/
public ExtendedEventPileupElement( SAMRecord read, int offset, int length ) {
this(read,offset, length, null);
}
public boolean isDeletion() {
return type == Type.DELETION;
}
public boolean isInsertion() {
return type == Type.INSERTION;
}
public boolean isIndel() {
return isDeletion() || isInsertion();
}
public SAMRecord getRead() { return read; }
public int getOffset() { return offset; }
public int getMappingQual() { return read.getMappingQuality(); }
public Type getType() { return type; }
public GenomeLoc getLocation() {
return GenomeLocParser.createGenomeLoc(read.getReferenceIndex(),read.getAlignmentStart()+offset, read.getAlignmentStart()+offset+eventLength);
}
/** Returns length of the event (number of inserted or deleted bases */
public int getEventLength() { return eventLength; }
/** Returns actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read.
* */
public String getEventBases() { return eventBases; }
@Override
public String toString() {
char c = '.';
String fillStr = null ;
if ( isDeletion() ) {
c = '-';
char [] filler = new char[eventLength];
Arrays.fill(filler, 'D');
fillStr = new String(filler);
}
else if ( isInsertion() ) c = '+';
return String.format("%s @ %d = %c%s MQ%d", getRead().getReadName(), getOffset(), c, isIndel()?
(isInsertion() ? eventBases : fillStr ): "", getMappingQual());
}
}

View File

@ -0,0 +1,324 @@
package org.broadinstitute.sting.utils.pileup;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.gatk.iterators.IterableIterator;
import java.util.*;
import net.sf.samtools.SAMRecord;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Dec 29, 2009
* Time: 12:25:55 PM
* To change this template use File | Settings | File Templates.
*/
public class ReadBackedExtendedEventPileup implements Iterable<ExtendedEventPileupElement> {
private GenomeLoc loc = null;
private ArrayList<ExtendedEventPileupElement> pileup = null;
private int size = 0; // cached value of the size of the pileup
private int nDeletions = 0; // cached value of the number of deletions
private int nInsertions = 0;
private int nMQ0Reads = 0; // cached value of the number of MQ0 reads
/**
* Create a new version of a read backed pileup at loc without any aligned reads
*
*/
public ReadBackedExtendedEventPileup(GenomeLoc loc ) {
this(loc, new ArrayList<ExtendedEventPileupElement>(0));
}
/**
* Create a new version of a read backed pileup at loc, using the reads and their corresponding
* offsets. This lower level constructure assumes pileup is well-formed and merely keeps a
* pointer to pileup. Don't go changing the data in pileup.
*
*/
public ReadBackedExtendedEventPileup(GenomeLoc loc, ArrayList<ExtendedEventPileupElement> pileup ) {
if ( loc == null ) throw new StingException("Illegal null genomeloc in ReadBackedExtendedEventPileup");
if ( pileup == null ) throw new StingException("Illegal null pileup in ReadBackedExtendedEventPileup");
this.loc = loc;
this.pileup = pileup;
calculatedCachedData();
}
/**
* Optimization of above constructor where all of the cached data is provided
* @param loc
* @param pileup
*/
public ReadBackedExtendedEventPileup(GenomeLoc loc, ArrayList<ExtendedEventPileupElement> pileup, int size, int nInsertions, int nDeletions, int nMQ0Reads ) {
if ( loc == null ) throw new StingException("Illegal null genomeloc in ReadBackedExtendedEventPileup");
if ( pileup == null ) throw new StingException("Illegal null pileup in ReadBackedExtendedEventPileup");
this.loc = loc;
this.pileup = pileup;
this.size = size;
this.nDeletions = nDeletions;
this.nInsertions = nInsertions;
this.nMQ0Reads = nMQ0Reads;
}
/**
* Calculate cached sizes, nDeletion, and base counts for the pileup. This calculation is done upfront,
* so you pay the cost at the start, but it's more efficient to do this rather than pay the cost of calling
* sizes, nDeletion, etc. over and over potentially.
*/
private void calculatedCachedData() {
size = 0;
nDeletions = 0;
nInsertions = 0;
nMQ0Reads = 0;
for ( ExtendedEventPileupElement p : this ) {
size++;
if ( p.isDeletion() ) {
nDeletions++;
} else {
if ( p.isInsertion() ) nInsertions++;
}
if ( p.getRead().getMappingQuality() == 0 ) {
nMQ0Reads++;
}
}
}
// --------------------------------------------------------
//
// Special 'constructors'
//
// --------------------------------------------------------
/**
* Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this
* does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy
* of the pileup (just returns this) if there are no MQ0 reads in the pileup.
*
* @return
*/
public ReadBackedExtendedEventPileup getPileupWithoutMappingQualityZeroReads() {
if ( getNumberOfMappingQualityZeroReads() > 0 ) {
ArrayList<ExtendedEventPileupElement> filteredPileup = new ArrayList<ExtendedEventPileupElement>();
for ( ExtendedEventPileupElement p : pileup ) {
if ( p.getRead().getMappingQuality() > 0 ) {
filteredPileup.add(p);
}
}
return new ReadBackedExtendedEventPileup(loc, filteredPileup);
} else {
return this;
}
}
public ReadBackedExtendedEventPileup getMappingFilteredPileup( int minMapQ ) {
ArrayList<ExtendedEventPileupElement> filteredPileup = new ArrayList<ExtendedEventPileupElement>();
for ( ExtendedEventPileupElement p : pileup ) {
if ( p.getRead().getMappingQuality() >= minMapQ ) {
filteredPileup.add(p);
}
}
return new ReadBackedExtendedEventPileup(loc, filteredPileup);
}
/**
* Returns a pileup randomly downsampled to the desiredCoverage.
*
* @param desiredCoverage
* @return
*/
public ReadBackedExtendedEventPileup getDownsampledPileup(int desiredCoverage) {
if ( size() <= desiredCoverage )
return this;
// randomly choose numbers corresponding to positions in the reads list
Random generator = new Random();
TreeSet<Integer> positions = new TreeSet<Integer>();
for ( int i = 0; i < desiredCoverage; /* no update */ ) {
if ( positions.add(generator.nextInt(pileup.size())) )
i++;
}
Iterator positionIter = positions.iterator();
ArrayList<ExtendedEventPileupElement> downsampledPileup = new ArrayList<ExtendedEventPileupElement>();
while ( positionIter.hasNext() ) {
int nextReadToKeep = (Integer)positionIter.next();
downsampledPileup.add(pileup.get(nextReadToKeep));
}
return new ReadBackedExtendedEventPileup(getLocation(), downsampledPileup);
}
// --------------------------------------------------------
//
// iterators
//
// --------------------------------------------------------
/**
* The best way to access PileupElements where you only care about the bases and quals in the pileup.
*
* for (PileupElement p : this) { doSomething(p); }
*
* Provides efficient iteration of the data.
*
* @return
*/
public Iterator<ExtendedEventPileupElement> iterator() {
return pileup.iterator();
}
/**
* Returns the number of deletion events in this pileup
*
* @return
*/
public int getNumberOfDeletions() {
return nDeletions;
}
/**
* Returns the number of insertion events in this pileup
*
* @return
*/
public int getNumberOfInsertions() {
return nInsertions;
}
/**
* Returns the number of mapping quality zero reads in this pileup.
* @return
*/
public int getNumberOfMappingQualityZeroReads() {
return nMQ0Reads;
}
// public int getNumberOfDeletions() {
// int n = 0;
//
// for ( int i = 0; i < size(); i++ ) {
// if ( getOffsets().get(i) != -1 ) { n++; }
// }
//
// return n;
// }
/**
* @return the number of elements in this pileup
*/
public int size() {
return size;
}
/**
* @return the location of this pileup
*/
public GenomeLoc getLocation() {
return loc;
}
public String getShortPileupString() {
// In the pileup format, each extended event line has genomic position (chromosome name and offset),
// reference "base" (always set to "E" for E(xtended)), then 'I','D', or '.' symbol for each read representing
// insertion, deletion or no-event, respectively.
return String.format("%s %s E %s",
getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate
new String(getEvents()) );
}
// --------------------------------------------------------
//
// Convenience functions that may be slow
//
// --------------------------------------------------------
/**
* Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time
* @return
*/
public List<SAMRecord> getReads() {
List<SAMRecord> reads = new ArrayList<SAMRecord>(size());
for ( ExtendedEventPileupElement pile : this ) { reads.add(pile.getRead()); }
return reads;
}
/**
* Returns a list of the offsets in this pileup. Note this call costs O(n) and allocates fresh lists each time
* @return
*/
public List<Integer> getOffsets() {
List<Integer> offsets = new ArrayList<Integer>(size());
for ( ExtendedEventPileupElement pile : this ) { offsets.add(pile.getOffset()); }
return offsets;
}
/**
* Returns an array of the events in this pileup ('I', 'D', or '.'). Note this call costs O(n) and allocates fresh array each time
* @return
*/
public byte[] getEvents() {
byte[] v = new byte[size()];
int i = 0;
for ( ExtendedEventPileupElement e : this ) {
switch ( e.getType() ) {
case INSERTION: v[i] = 'I'; break;
case DELETION: v[i] = 'D'; break;
case NOEVENT: v[i] = '.'; break;
default: throw new StingException("Unknown event type encountered: "+e.getType());
}
i++;
}
return v;
}
/**
* Get an array of the mapping qualities
* @return
*/
public byte[] getMappingQuals() {
byte[] v = new byte[size()];
int i = 0;
for ( ExtendedEventPileupElement e : this ) { v[i++] = (byte)e.getRead().getMappingQuality(); }
return v;
}
//
// Private functions for printing pileups
//
private String getMappingQualsString() {
return quals2String(getMappingQuals());
}
private static String quals2String( byte[] quals ) {
StringBuilder qualStr = new StringBuilder();
for ( int qual : quals ) {
qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea
char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63
qualStr.append(qualChar);
}
return qualStr.toString();
}
}