From f445745c56c9cfb1131a562bb04f8d380a8cf487 Mon Sep 17 00:00:00 2001 From: asivache Date: Tue, 29 Dec 2009 20:03:39 +0000 Subject: [PATCH] Pileup element and corresponding container class tweaked for representing pileups of extended events (indels) at a given locus. There's some redundancy with PileupElement and ReadBackedPileup (should we rename them to BasePileupElement and ReadBackedBasePileup?), so that abstracting a basic interface/abstract base from these classes can be considered in the future git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2469 348d0f76-0448-11de-a6fe-93d51630548a --- .../pileup/ExtendedEventPileupElement.java | 126 +++++++ .../pileup/ReadBackedExtendedEventPileup.java | 324 ++++++++++++++++++ 2 files changed, 450 insertions(+) create mode 100644 java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java create mode 100644 java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java diff --git a/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java b/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java new file mode 100644 index 000000000..e5588631b --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java @@ -0,0 +1,126 @@ +package org.broadinstitute.sting.utils.pileup; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import net.sf.samtools.SAMRecord; + +import java.util.Arrays; +import java.util.Collections; + +/** + * In the "standard" locus traversal mode, + * the traversal is performed striclty over the reference bases. Thus, only pileups of bases (and hence local events + * such as point mutations) are "seen" at every invocation of the walker's map() function at every (genomic) locus. Deletions + * are seen on the base-by-base basis (i.e. the pileup does keep the information about the current reference base being deleted + * in some reads), but the information about the extended event (deletion length, string of all deleted bases) is not kept. + * The insertions that may be present in some reads are not seen at all in such strict reference traversal mode. + * + * By convention, any extended event (indel) is mapped onto the reference at the last base prior to the event (i.e. + * last base before the insertion or deletion). If the special "extended" traversal mode is turned on and there is + * an indel in at least one read that maps onto the reference position Z, the walker's map function will be called twice: + * first call will be performed in a "standard" mode, with a pileup of bases over the position Z, and then the additional + * call will be made at the same position with a pileup of event/noevent calls, where events are extended and contain + * full information about insertions/deletions. Then the next, "standard", call to map() will be performed at the next + * (covered) reference position. Note that if the extended event at Z was a deletion, the "standard" base pileup at + * Z+1 and following bases may still contain deleted bases. However the fully extended event call will be performed + * only once, at the position where the indel maps (starts). + * + * This class wraps an "extended" event (indel) so that in can be added to a pileup of events at a given location. + * + * Created by IntelliJ IDEA. + * User: asivache + * Date: Dec 21, 2009 + * Time: 2:57:55 PM + * To change this template use File | Settings | File Templates. + */ +public class ExtendedEventPileupElement { + public enum Type { + NOEVENT, DELETION, INSERTION + }; + + private Type type = null; + private int eventLength = -1; + private String eventBases = null; // if it is a deletion, we do not have information about the actual deleted bases + // in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases + private SAMRecord read; + private int offset; // position in the read immediately BEFORE the event + + /** Constructor for extended pileup element (indel). + * + * @param read the read, in which the indel is observed + * @param offset position in the read immediately before the indel (can be -1 if read starts with an insertion) + * @param length length of the indel (number of inserted or deleted bases); length <=0 indicates that the read has no indel (NOEVENT) + * @param eventBases inserted bases. null indicates that the event is a deletion; ignored if length<=0 (noevent) + */ + public ExtendedEventPileupElement( SAMRecord read, int offset, int length, byte[] eventBases ) { + this.read = read; + this.offset = offset; + this.eventLength = length; + if ( length <= 0 ) type = Type.NOEVENT; + else { + if ( eventBases != null ) { + this.eventBases = new String(eventBases); + type = Type.INSERTION; + } else { + type = Type.DELETION; + } + } + } + + /** Constructor for deletion or noevent calls - does not take event bases as an argument (as those should + * be null or are ignored in these cases anyway) + * @param read + * @param offset + * @param length + */ + public ExtendedEventPileupElement( SAMRecord read, int offset, int length ) { + this(read,offset, length, null); + } + + public boolean isDeletion() { + return type == Type.DELETION; + } + + public boolean isInsertion() { + return type == Type.INSERTION; + } + + public boolean isIndel() { + return isDeletion() || isInsertion(); + } + + public SAMRecord getRead() { return read; } + + public int getOffset() { return offset; } + + public int getMappingQual() { return read.getMappingQuality(); } + + public Type getType() { return type; } + + public GenomeLoc getLocation() { + return GenomeLocParser.createGenomeLoc(read.getReferenceIndex(),read.getAlignmentStart()+offset, read.getAlignmentStart()+offset+eventLength); + } + + /** Returns length of the event (number of inserted or deleted bases */ + public int getEventLength() { return eventLength; } + + /** Returns actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read. + * */ + public String getEventBases() { return eventBases; } + + @Override + public String toString() { + char c = '.'; + String fillStr = null ; + if ( isDeletion() ) { + c = '-'; + char [] filler = new char[eventLength]; + Arrays.fill(filler, 'D'); + fillStr = new String(filler); + } + else if ( isInsertion() ) c = '+'; + return String.format("%s @ %d = %c%s MQ%d", getRead().getReadName(), getOffset(), c, isIndel()? + (isInsertion() ? eventBases : fillStr ): "", getMappingQual()); + } + +} diff --git a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java b/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java new file mode 100644 index 000000000..7885681db --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java @@ -0,0 +1,324 @@ +package org.broadinstitute.sting.utils.pileup; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.gatk.iterators.IterableIterator; + +import java.util.*; + +import net.sf.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Dec 29, 2009 + * Time: 12:25:55 PM + * To change this template use File | Settings | File Templates. + */ +public class ReadBackedExtendedEventPileup implements Iterable { + private GenomeLoc loc = null; + private ArrayList pileup = null; + + private int size = 0; // cached value of the size of the pileup + private int nDeletions = 0; // cached value of the number of deletions + private int nInsertions = 0; + private int nMQ0Reads = 0; // cached value of the number of MQ0 reads + + /** + * Create a new version of a read backed pileup at loc without any aligned reads + * + */ + public ReadBackedExtendedEventPileup(GenomeLoc loc ) { + this(loc, new ArrayList(0)); + } + + /** + * Create a new version of a read backed pileup at loc, using the reads and their corresponding + * offsets. This lower level constructure assumes pileup is well-formed and merely keeps a + * pointer to pileup. Don't go changing the data in pileup. + * + */ + public ReadBackedExtendedEventPileup(GenomeLoc loc, ArrayList pileup ) { + if ( loc == null ) throw new StingException("Illegal null genomeloc in ReadBackedExtendedEventPileup"); + if ( pileup == null ) throw new StingException("Illegal null pileup in ReadBackedExtendedEventPileup"); + + this.loc = loc; + this.pileup = pileup; + + calculatedCachedData(); + } + + /** + * Optimization of above constructor where all of the cached data is provided + * @param loc + * @param pileup + */ + public ReadBackedExtendedEventPileup(GenomeLoc loc, ArrayList pileup, int size, int nInsertions, int nDeletions, int nMQ0Reads ) { + if ( loc == null ) throw new StingException("Illegal null genomeloc in ReadBackedExtendedEventPileup"); + if ( pileup == null ) throw new StingException("Illegal null pileup in ReadBackedExtendedEventPileup"); + + this.loc = loc; + this.pileup = pileup; + this.size = size; + this.nDeletions = nDeletions; + this.nInsertions = nInsertions; + this.nMQ0Reads = nMQ0Reads; + } + + + /** + * Calculate cached sizes, nDeletion, and base counts for the pileup. This calculation is done upfront, + * so you pay the cost at the start, but it's more efficient to do this rather than pay the cost of calling + * sizes, nDeletion, etc. over and over potentially. + */ + private void calculatedCachedData() { + size = 0; + nDeletions = 0; + nInsertions = 0; + nMQ0Reads = 0; + + for ( ExtendedEventPileupElement p : this ) { + size++; + if ( p.isDeletion() ) { + nDeletions++; + } else { + if ( p.isInsertion() ) nInsertions++; + } + + if ( p.getRead().getMappingQuality() == 0 ) { + nMQ0Reads++; + } + } + } + + + // -------------------------------------------------------- + // + // Special 'constructors' + // + // -------------------------------------------------------- + + + /** + * Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this + * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy + * of the pileup (just returns this) if there are no MQ0 reads in the pileup. + * + * @return + */ + public ReadBackedExtendedEventPileup getPileupWithoutMappingQualityZeroReads() { + + if ( getNumberOfMappingQualityZeroReads() > 0 ) { + ArrayList filteredPileup = new ArrayList(); + for ( ExtendedEventPileupElement p : pileup ) { + if ( p.getRead().getMappingQuality() > 0 ) { + filteredPileup.add(p); + } + } + return new ReadBackedExtendedEventPileup(loc, filteredPileup); + } else { + return this; + } + } + + public ReadBackedExtendedEventPileup getMappingFilteredPileup( int minMapQ ) { + ArrayList filteredPileup = new ArrayList(); + + for ( ExtendedEventPileupElement p : pileup ) { + if ( p.getRead().getMappingQuality() >= minMapQ ) { + filteredPileup.add(p); + } + } + + return new ReadBackedExtendedEventPileup(loc, filteredPileup); + } + + /** + * Returns a pileup randomly downsampled to the desiredCoverage. + * + * @param desiredCoverage + * @return + */ + public ReadBackedExtendedEventPileup getDownsampledPileup(int desiredCoverage) { + if ( size() <= desiredCoverage ) + return this; + + // randomly choose numbers corresponding to positions in the reads list + Random generator = new Random(); + TreeSet positions = new TreeSet(); + for ( int i = 0; i < desiredCoverage; /* no update */ ) { + if ( positions.add(generator.nextInt(pileup.size())) ) + i++; + } + + Iterator positionIter = positions.iterator(); + ArrayList downsampledPileup = new ArrayList(); + + while ( positionIter.hasNext() ) { + int nextReadToKeep = (Integer)positionIter.next(); + downsampledPileup.add(pileup.get(nextReadToKeep)); + } + + return new ReadBackedExtendedEventPileup(getLocation(), downsampledPileup); + } + + // -------------------------------------------------------- + // + // iterators + // + // -------------------------------------------------------- + + /** + * The best way to access PileupElements where you only care about the bases and quals in the pileup. + * + * for (PileupElement p : this) { doSomething(p); } + * + * Provides efficient iteration of the data. + * + * @return + */ + public Iterator iterator() { + return pileup.iterator(); + } + + + /** + * Returns the number of deletion events in this pileup + * + * @return + */ + public int getNumberOfDeletions() { + return nDeletions; + } + + /** + * Returns the number of insertion events in this pileup + * + * @return + */ + public int getNumberOfInsertions() { + return nInsertions; + } + + /** + * Returns the number of mapping quality zero reads in this pileup. + * @return + */ + public int getNumberOfMappingQualityZeroReads() { + return nMQ0Reads; + } + +// public int getNumberOfDeletions() { +// int n = 0; +// +// for ( int i = 0; i < size(); i++ ) { +// if ( getOffsets().get(i) != -1 ) { n++; } +// } +// +// return n; +// } + + /** + * @return the number of elements in this pileup + */ + public int size() { + return size; + } + + /** + * @return the location of this pileup + */ + public GenomeLoc getLocation() { + return loc; + } + + + + public String getShortPileupString() { + // In the pileup format, each extended event line has genomic position (chromosome name and offset), + // reference "base" (always set to "E" for E(xtended)), then 'I','D', or '.' symbol for each read representing + // insertion, deletion or no-event, respectively. + return String.format("%s %s E %s", + getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate + new String(getEvents()) ); + } + + + // -------------------------------------------------------- + // + // Convenience functions that may be slow + // + // -------------------------------------------------------- + + /** + * Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time + * @return + */ + public List getReads() { + List reads = new ArrayList(size()); + for ( ExtendedEventPileupElement pile : this ) { reads.add(pile.getRead()); } + return reads; + } + + /** + * Returns a list of the offsets in this pileup. Note this call costs O(n) and allocates fresh lists each time + * @return + */ + public List getOffsets() { + List offsets = new ArrayList(size()); + for ( ExtendedEventPileupElement pile : this ) { offsets.add(pile.getOffset()); } + return offsets; + } + + /** + * Returns an array of the events in this pileup ('I', 'D', or '.'). Note this call costs O(n) and allocates fresh array each time + * @return + */ + public byte[] getEvents() { + byte[] v = new byte[size()]; + int i = 0; + for ( ExtendedEventPileupElement e : this ) { + switch ( e.getType() ) { + case INSERTION: v[i] = 'I'; break; + case DELETION: v[i] = 'D'; break; + case NOEVENT: v[i] = '.'; break; + default: throw new StingException("Unknown event type encountered: "+e.getType()); + } + i++; + } + return v; + } + + + /** + * Get an array of the mapping qualities + * @return + */ + public byte[] getMappingQuals() { + byte[] v = new byte[size()]; + int i = 0; + for ( ExtendedEventPileupElement e : this ) { v[i++] = (byte)e.getRead().getMappingQuality(); } + return v; + } + + + // + // Private functions for printing pileups + // + private String getMappingQualsString() { + return quals2String(getMappingQuals()); + } + + private static String quals2String( byte[] quals ) { + StringBuilder qualStr = new StringBuilder(); + for ( int qual : quals ) { + qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea + char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63 + qualStr.append(qualChar); + } + + return qualStr.toString(); + } + +}