Pileup element and corresponding container class tweaked for representing pileups of extended events (indels) at a given locus. There's some redundancy with PileupElement and ReadBackedPileup (should we rename them to BasePileupElement and ReadBackedBasePileup?), so that abstracting a basic interface/abstract base from these classes can be considered in the future
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2469 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
87e863b48d
commit
f445745c56
|
|
@ -0,0 +1,126 @@
|
|||
package org.broadinstitute.sting.utils.pileup;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
|
||||
/**
|
||||
* In the "standard" locus traversal mode,
|
||||
* the traversal is performed striclty over the reference bases. Thus, only pileups of bases (and hence local events
|
||||
* such as point mutations) are "seen" at every invocation of the walker's map() function at every (genomic) locus. Deletions
|
||||
* are seen on the base-by-base basis (i.e. the pileup does keep the information about the current reference base being deleted
|
||||
* in some reads), but the information about the extended event (deletion length, string of all deleted bases) is not kept.
|
||||
* The insertions that may be present in some reads are not seen at all in such strict reference traversal mode.
|
||||
*
|
||||
* By convention, any extended event (indel) is mapped onto the reference at the last base prior to the event (i.e.
|
||||
* last base before the insertion or deletion). If the special "extended" traversal mode is turned on and there is
|
||||
* an indel in at least one read that maps onto the reference position Z, the walker's map function will be called twice:
|
||||
* first call will be performed in a "standard" mode, with a pileup of bases over the position Z, and then the additional
|
||||
* call will be made at the same position with a pileup of event/noevent calls, where events are extended and contain
|
||||
* full information about insertions/deletions. Then the next, "standard", call to map() will be performed at the next
|
||||
* (covered) reference position. Note that if the extended event at Z was a deletion, the "standard" base pileup at
|
||||
* Z+1 and following bases may still contain deleted bases. However the fully extended event call will be performed
|
||||
* only once, at the position where the indel maps (starts).
|
||||
*
|
||||
* This class wraps an "extended" event (indel) so that in can be added to a pileup of events at a given location.
|
||||
*
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: asivache
|
||||
* Date: Dec 21, 2009
|
||||
* Time: 2:57:55 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ExtendedEventPileupElement {
|
||||
public enum Type {
|
||||
NOEVENT, DELETION, INSERTION
|
||||
};
|
||||
|
||||
private Type type = null;
|
||||
private int eventLength = -1;
|
||||
private String eventBases = null; // if it is a deletion, we do not have information about the actual deleted bases
|
||||
// in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases
|
||||
private SAMRecord read;
|
||||
private int offset; // position in the read immediately BEFORE the event
|
||||
|
||||
/** Constructor for extended pileup element (indel).
|
||||
*
|
||||
* @param read the read, in which the indel is observed
|
||||
* @param offset position in the read immediately before the indel (can be -1 if read starts with an insertion)
|
||||
* @param length length of the indel (number of inserted or deleted bases); length <=0 indicates that the read has no indel (NOEVENT)
|
||||
* @param eventBases inserted bases. null indicates that the event is a deletion; ignored if length<=0 (noevent)
|
||||
*/
|
||||
public ExtendedEventPileupElement( SAMRecord read, int offset, int length, byte[] eventBases ) {
|
||||
this.read = read;
|
||||
this.offset = offset;
|
||||
this.eventLength = length;
|
||||
if ( length <= 0 ) type = Type.NOEVENT;
|
||||
else {
|
||||
if ( eventBases != null ) {
|
||||
this.eventBases = new String(eventBases);
|
||||
type = Type.INSERTION;
|
||||
} else {
|
||||
type = Type.DELETION;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Constructor for deletion or noevent calls - does not take event bases as an argument (as those should
|
||||
* be null or are ignored in these cases anyway)
|
||||
* @param read
|
||||
* @param offset
|
||||
* @param length
|
||||
*/
|
||||
public ExtendedEventPileupElement( SAMRecord read, int offset, int length ) {
|
||||
this(read,offset, length, null);
|
||||
}
|
||||
|
||||
public boolean isDeletion() {
|
||||
return type == Type.DELETION;
|
||||
}
|
||||
|
||||
public boolean isInsertion() {
|
||||
return type == Type.INSERTION;
|
||||
}
|
||||
|
||||
public boolean isIndel() {
|
||||
return isDeletion() || isInsertion();
|
||||
}
|
||||
|
||||
public SAMRecord getRead() { return read; }
|
||||
|
||||
public int getOffset() { return offset; }
|
||||
|
||||
public int getMappingQual() { return read.getMappingQuality(); }
|
||||
|
||||
public Type getType() { return type; }
|
||||
|
||||
public GenomeLoc getLocation() {
|
||||
return GenomeLocParser.createGenomeLoc(read.getReferenceIndex(),read.getAlignmentStart()+offset, read.getAlignmentStart()+offset+eventLength);
|
||||
}
|
||||
|
||||
/** Returns length of the event (number of inserted or deleted bases */
|
||||
public int getEventLength() { return eventLength; }
|
||||
|
||||
/** Returns actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read.
|
||||
* */
|
||||
public String getEventBases() { return eventBases; }
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
char c = '.';
|
||||
String fillStr = null ;
|
||||
if ( isDeletion() ) {
|
||||
c = '-';
|
||||
char [] filler = new char[eventLength];
|
||||
Arrays.fill(filler, 'D');
|
||||
fillStr = new String(filler);
|
||||
}
|
||||
else if ( isInsertion() ) c = '+';
|
||||
return String.format("%s @ %d = %c%s MQ%d", getRead().getReadName(), getOffset(), c, isIndel()?
|
||||
(isInsertion() ? eventBases : fillStr ): "", getMappingQual());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,324 @@
|
|||
package org.broadinstitute.sting.utils.pileup;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.gatk.iterators.IterableIterator;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: asivache
|
||||
* Date: Dec 29, 2009
|
||||
* Time: 12:25:55 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ReadBackedExtendedEventPileup implements Iterable<ExtendedEventPileupElement> {
|
||||
private GenomeLoc loc = null;
|
||||
private ArrayList<ExtendedEventPileupElement> pileup = null;
|
||||
|
||||
private int size = 0; // cached value of the size of the pileup
|
||||
private int nDeletions = 0; // cached value of the number of deletions
|
||||
private int nInsertions = 0;
|
||||
private int nMQ0Reads = 0; // cached value of the number of MQ0 reads
|
||||
|
||||
/**
|
||||
* Create a new version of a read backed pileup at loc without any aligned reads
|
||||
*
|
||||
*/
|
||||
public ReadBackedExtendedEventPileup(GenomeLoc loc ) {
|
||||
this(loc, new ArrayList<ExtendedEventPileupElement>(0));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new version of a read backed pileup at loc, using the reads and their corresponding
|
||||
* offsets. This lower level constructure assumes pileup is well-formed and merely keeps a
|
||||
* pointer to pileup. Don't go changing the data in pileup.
|
||||
*
|
||||
*/
|
||||
public ReadBackedExtendedEventPileup(GenomeLoc loc, ArrayList<ExtendedEventPileupElement> pileup ) {
|
||||
if ( loc == null ) throw new StingException("Illegal null genomeloc in ReadBackedExtendedEventPileup");
|
||||
if ( pileup == null ) throw new StingException("Illegal null pileup in ReadBackedExtendedEventPileup");
|
||||
|
||||
this.loc = loc;
|
||||
this.pileup = pileup;
|
||||
|
||||
calculatedCachedData();
|
||||
}
|
||||
|
||||
/**
|
||||
* Optimization of above constructor where all of the cached data is provided
|
||||
* @param loc
|
||||
* @param pileup
|
||||
*/
|
||||
public ReadBackedExtendedEventPileup(GenomeLoc loc, ArrayList<ExtendedEventPileupElement> pileup, int size, int nInsertions, int nDeletions, int nMQ0Reads ) {
|
||||
if ( loc == null ) throw new StingException("Illegal null genomeloc in ReadBackedExtendedEventPileup");
|
||||
if ( pileup == null ) throw new StingException("Illegal null pileup in ReadBackedExtendedEventPileup");
|
||||
|
||||
this.loc = loc;
|
||||
this.pileup = pileup;
|
||||
this.size = size;
|
||||
this.nDeletions = nDeletions;
|
||||
this.nInsertions = nInsertions;
|
||||
this.nMQ0Reads = nMQ0Reads;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Calculate cached sizes, nDeletion, and base counts for the pileup. This calculation is done upfront,
|
||||
* so you pay the cost at the start, but it's more efficient to do this rather than pay the cost of calling
|
||||
* sizes, nDeletion, etc. over and over potentially.
|
||||
*/
|
||||
private void calculatedCachedData() {
|
||||
size = 0;
|
||||
nDeletions = 0;
|
||||
nInsertions = 0;
|
||||
nMQ0Reads = 0;
|
||||
|
||||
for ( ExtendedEventPileupElement p : this ) {
|
||||
size++;
|
||||
if ( p.isDeletion() ) {
|
||||
nDeletions++;
|
||||
} else {
|
||||
if ( p.isInsertion() ) nInsertions++;
|
||||
}
|
||||
|
||||
if ( p.getRead().getMappingQuality() == 0 ) {
|
||||
nMQ0Reads++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------
|
||||
//
|
||||
// Special 'constructors'
|
||||
//
|
||||
// --------------------------------------------------------
|
||||
|
||||
|
||||
/**
|
||||
* Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this
|
||||
* does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy
|
||||
* of the pileup (just returns this) if there are no MQ0 reads in the pileup.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public ReadBackedExtendedEventPileup getPileupWithoutMappingQualityZeroReads() {
|
||||
|
||||
if ( getNumberOfMappingQualityZeroReads() > 0 ) {
|
||||
ArrayList<ExtendedEventPileupElement> filteredPileup = new ArrayList<ExtendedEventPileupElement>();
|
||||
for ( ExtendedEventPileupElement p : pileup ) {
|
||||
if ( p.getRead().getMappingQuality() > 0 ) {
|
||||
filteredPileup.add(p);
|
||||
}
|
||||
}
|
||||
return new ReadBackedExtendedEventPileup(loc, filteredPileup);
|
||||
} else {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
public ReadBackedExtendedEventPileup getMappingFilteredPileup( int minMapQ ) {
|
||||
ArrayList<ExtendedEventPileupElement> filteredPileup = new ArrayList<ExtendedEventPileupElement>();
|
||||
|
||||
for ( ExtendedEventPileupElement p : pileup ) {
|
||||
if ( p.getRead().getMappingQuality() >= minMapQ ) {
|
||||
filteredPileup.add(p);
|
||||
}
|
||||
}
|
||||
|
||||
return new ReadBackedExtendedEventPileup(loc, filteredPileup);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a pileup randomly downsampled to the desiredCoverage.
|
||||
*
|
||||
* @param desiredCoverage
|
||||
* @return
|
||||
*/
|
||||
public ReadBackedExtendedEventPileup getDownsampledPileup(int desiredCoverage) {
|
||||
if ( size() <= desiredCoverage )
|
||||
return this;
|
||||
|
||||
// randomly choose numbers corresponding to positions in the reads list
|
||||
Random generator = new Random();
|
||||
TreeSet<Integer> positions = new TreeSet<Integer>();
|
||||
for ( int i = 0; i < desiredCoverage; /* no update */ ) {
|
||||
if ( positions.add(generator.nextInt(pileup.size())) )
|
||||
i++;
|
||||
}
|
||||
|
||||
Iterator positionIter = positions.iterator();
|
||||
ArrayList<ExtendedEventPileupElement> downsampledPileup = new ArrayList<ExtendedEventPileupElement>();
|
||||
|
||||
while ( positionIter.hasNext() ) {
|
||||
int nextReadToKeep = (Integer)positionIter.next();
|
||||
downsampledPileup.add(pileup.get(nextReadToKeep));
|
||||
}
|
||||
|
||||
return new ReadBackedExtendedEventPileup(getLocation(), downsampledPileup);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------
|
||||
//
|
||||
// iterators
|
||||
//
|
||||
// --------------------------------------------------------
|
||||
|
||||
/**
|
||||
* The best way to access PileupElements where you only care about the bases and quals in the pileup.
|
||||
*
|
||||
* for (PileupElement p : this) { doSomething(p); }
|
||||
*
|
||||
* Provides efficient iteration of the data.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Iterator<ExtendedEventPileupElement> iterator() {
|
||||
return pileup.iterator();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the number of deletion events in this pileup
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public int getNumberOfDeletions() {
|
||||
return nDeletions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of insertion events in this pileup
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public int getNumberOfInsertions() {
|
||||
return nInsertions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of mapping quality zero reads in this pileup.
|
||||
* @return
|
||||
*/
|
||||
public int getNumberOfMappingQualityZeroReads() {
|
||||
return nMQ0Reads;
|
||||
}
|
||||
|
||||
// public int getNumberOfDeletions() {
|
||||
// int n = 0;
|
||||
//
|
||||
// for ( int i = 0; i < size(); i++ ) {
|
||||
// if ( getOffsets().get(i) != -1 ) { n++; }
|
||||
// }
|
||||
//
|
||||
// return n;
|
||||
// }
|
||||
|
||||
/**
|
||||
* @return the number of elements in this pileup
|
||||
*/
|
||||
public int size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the location of this pileup
|
||||
*/
|
||||
public GenomeLoc getLocation() {
|
||||
return loc;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public String getShortPileupString() {
|
||||
// In the pileup format, each extended event line has genomic position (chromosome name and offset),
|
||||
// reference "base" (always set to "E" for E(xtended)), then 'I','D', or '.' symbol for each read representing
|
||||
// insertion, deletion or no-event, respectively.
|
||||
return String.format("%s %s E %s",
|
||||
getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate
|
||||
new String(getEvents()) );
|
||||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------
|
||||
//
|
||||
// Convenience functions that may be slow
|
||||
//
|
||||
// --------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time
|
||||
* @return
|
||||
*/
|
||||
public List<SAMRecord> getReads() {
|
||||
List<SAMRecord> reads = new ArrayList<SAMRecord>(size());
|
||||
for ( ExtendedEventPileupElement pile : this ) { reads.add(pile.getRead()); }
|
||||
return reads;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of the offsets in this pileup. Note this call costs O(n) and allocates fresh lists each time
|
||||
* @return
|
||||
*/
|
||||
public List<Integer> getOffsets() {
|
||||
List<Integer> offsets = new ArrayList<Integer>(size());
|
||||
for ( ExtendedEventPileupElement pile : this ) { offsets.add(pile.getOffset()); }
|
||||
return offsets;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of the events in this pileup ('I', 'D', or '.'). Note this call costs O(n) and allocates fresh array each time
|
||||
* @return
|
||||
*/
|
||||
public byte[] getEvents() {
|
||||
byte[] v = new byte[size()];
|
||||
int i = 0;
|
||||
for ( ExtendedEventPileupElement e : this ) {
|
||||
switch ( e.getType() ) {
|
||||
case INSERTION: v[i] = 'I'; break;
|
||||
case DELETION: v[i] = 'D'; break;
|
||||
case NOEVENT: v[i] = '.'; break;
|
||||
default: throw new StingException("Unknown event type encountered: "+e.getType());
|
||||
}
|
||||
i++;
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get an array of the mapping qualities
|
||||
* @return
|
||||
*/
|
||||
public byte[] getMappingQuals() {
|
||||
byte[] v = new byte[size()];
|
||||
int i = 0;
|
||||
for ( ExtendedEventPileupElement e : this ) { v[i++] = (byte)e.getRead().getMappingQuality(); }
|
||||
return v;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Private functions for printing pileups
|
||||
//
|
||||
private String getMappingQualsString() {
|
||||
return quals2String(getMappingQuals());
|
||||
}
|
||||
|
||||
private static String quals2String( byte[] quals ) {
|
||||
StringBuilder qualStr = new StringBuilder();
|
||||
for ( int qual : quals ) {
|
||||
qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea
|
||||
char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63
|
||||
qualStr.append(qualChar);
|
||||
}
|
||||
|
||||
return qualStr.toString();
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue