A clean, fast way to compute fragment pileups. Now consumes no CPU time at all. Ready for general use.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5524 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-03-27 14:26:29 +00:00
parent bae0b6cba8
commit 231d095316
2 changed files with 99 additions and 70 deletions

View File

@ -34,10 +34,6 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import static java.lang.Math.log10;
import static java.lang.Math.pow;
@ -240,6 +236,14 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
return getPriors()[g.ordinal()];
}
// -------------------------------------------------------------------------------------
//
// add() routines. These are the workhorse routines for calculating the overall genotype
// likelihoods given observed bases and reads. Includes high-level operators all the
// way down to single base and qual functions.
//
// -------------------------------------------------------------------------------------
/**
 * Convenience overload that adds an entire pileup with both optional behaviors switched off.
 *
 * @param pileup the read-backed pileup whose observations should be added
 * @return whatever the three-argument overload returns for this pileup
 */
public int add(ReadBackedPileup pileup) {
    // flag names follow the order used by the other three-argument add() overloads in this class
    final boolean ignoreBadBases = false;
    final boolean capBaseQualsAtMappingQual = false;
    return add(pileup, ignoreBadBases, capBaseQualsAtMappingQual);
}
@ -258,59 +262,58 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
int n = 0;
// for each fragment, add to the likelihoods
for ( FragmentPileup.PerFragmentPileupElement fragment : new FragmentPileup(pileup) ) {
n += add(fragment, ignoreBadBases, capBaseQualsAtMappingQual);
}
FragmentPileup fpile = new FragmentPileup(pileup);
for ( PileupElement p : fpile.getOneReadPileup() )
n += add(p, ignoreBadBases, capBaseQualsAtMappingQual);
for ( FragmentPileup.TwoReadPileupElement twoRead : fpile.getTwoReadPileup() )
n += add(twoRead, ignoreBadBases, capBaseQualsAtMappingQual);
return n;
}
public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
return add(new FragmentPileup.PerFragmentPileupElement(elt), ignoreBadBases, capBaseQualsAtMappingQual);
byte obsBase = elt.getBase();
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual);
return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0) : 0;
}
private final static byte qualToUse(PileupElement p, boolean capBaseQualsAtMappingQual) {
byte qual = p.getQual();
public int add(FragmentPileup.TwoReadPileupElement twoRead, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
final byte observedBase1 = twoRead.getFirst().getBase();
final byte qualityScore1 = qualToUse(twoRead.getFirst(), ignoreBadBases, capBaseQualsAtMappingQual);
final byte observedBase2 = twoRead.getSecond().getBase();
final byte qualityScore2 = qualToUse(twoRead.getSecond(), ignoreBadBases, capBaseQualsAtMappingQual);
if ( qual > SAMUtils.MAX_PHRED_SCORE )
throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
if ( capBaseQualsAtMappingQual )
qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
return qual;
if ( qualityScore1 == 0 ) {
if ( qualityScore2 == 0 ) // abort early if we didn't see any good bases
return 0;
else {
return add(observedBase2, qualityScore2, (byte)0, (byte)0);
}
} else {
return add(observedBase1, qualityScore1, observedBase2, qualityScore2);
}
}
public int add(FragmentPileup.PerFragmentPileupElement fragment, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
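/**
* Adds the contribution of a single fragment, described by up to two observed bases, to the likelihoods.
* Callers that have only one usable observation pass (byte)0 for both obsBase2 and qual2.
*
* @param obsBase1 the first observed base
* @param qual1 the quality score to use for obsBase1
* @param obsBase2 the second observed base, or 0 when there is none
* @param qual2 can be 0, indicating no second base was observed for this fragment
* @return a count that the public add() overloads return and the pileup-level add() sums into its total
*/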
private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) {
// TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine
// TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future.
// TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here.
byte observedBase1 = 0, observedBase2 = 0, qualityScore1 = 0, qualityScore2 = 0;
if ( usableBase(fragment.getFirst(), ignoreBadBases) ) {
observedBase1 = fragment.getFirst().getBase();
qualityScore1 = qualToUse(fragment.getFirst(), capBaseQualsAtMappingQual);
}
if ( fragment.hasSecond() && usableBase(fragment.getSecond(), ignoreBadBases) ) {
observedBase2 = fragment.getSecond().getBase();
qualityScore2 = qualToUse(fragment.getSecond(), capBaseQualsAtMappingQual);
}
if ( observedBase1 == 0 ) {
if ( observedBase2 == 0 ) // abort early if we didn't see any good bases
return 0;
else { // otherwise make 2 1
observedBase1 = observedBase2;
qualityScore1 = qualityScore2;
observedBase2 = qualityScore2 = 0;
}
}
// Just look up the cached result if it's available, or compute and store it
DiploidSNPGenotypeLikelihoods gl;
if ( ! inCache(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY) ) {
gl = calculateCachedGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY);
if ( ! inCache(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY) ) {
gl = calculateCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY);
} else {
gl = getCachedGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY);
gl = getCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY);
}
// for bad bases, there are no likelihoods
@ -334,6 +337,12 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
return 1;
}
// -------------------------------------------------------------------------------------
//
// Dealing with the cache routines
//
// -------------------------------------------------------------------------------------
// Shared, statically allocated lookup table of previously computed likelihood objects.
// NOTE(review): the five dimensions appear to line up with the (observedBase1, qualityScore1,
// observedBase2, qualityScore2, ploidy) arguments of inCache() -- confirm against the index computation.
// NOTE(review): the extra slot in the third dimension (BASES.length+1) presumably stands for
// "no second base observed" -- confirm.
static DiploidSNPGenotypeLikelihoods[][][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][BaseUtils.BASES.length+1][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY];
protected boolean inCache(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
@ -475,6 +484,31 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
return logP;
}
/**
 * Picks the phred-scaled base quality to use in the likelihood calculation for one pileup element.
 *
 * A result of 0 signals that the element was rejected by usableBase and should contribute
 * nothing. Otherwise the element's own base quality is returned, optionally limited to the
 * mapping quality of the read it came from.
 *
 * @param p the pileup element whose quality is wanted
 * @param ignoreBadBases forwarded unchanged to usableBase
 * @param capBaseQualsAtMappingQual if true, return the smaller of the base quality and the mapping quality
 * @return 0 when usableBase rejects the element, otherwise the (possibly capped) base quality
 * @throws UserException.MalformedBAM if the base quality exceeds SAMUtils.MAX_PHRED_SCORE
 */
private final static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
    // unusable observations carry no quality at all
    if ( ! usableBase(p, ignoreBadBases) )
        return 0;

    final byte baseQual = p.getQual();

    // a quality above the allowed maximum is reported as a malformed input file
    if ( baseQual > SAMUtils.MAX_PHRED_SCORE )
        throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, baseQual, p.getRead().getReadName()));

    if ( ! capBaseQualsAtMappingQual )
        return baseQual;

    // capping requested: take whichever of base quality and mapping quality is lower
    return (byte)Math.min((int)p.getQual(), p.getMappingQual());
}
// -----------------------------------------------------------------------------------------------------------------
//
//

View File

@ -3,17 +3,23 @@ package org.broadinstitute.sting.utils.pileup;
import java.util.*;
/**
* An easy to access fragment-based pileup. new FragmentPileup(RBP) creates one, and you
* can either iterate over or get the collection of PerFragmentPileupElements.
* An easy to access fragment-based pileup, which contains two separate pileups. The first
* is a regular collection of PileupElements containing all of the reads in the original RBP
* that are the only read carrying information about their fragment. The second is a collection
* of TwoReadPileupElements that, as the name suggests, each contain two reads that are
* sequenced from the same underlying fragment.
*
* Based on the original code by E. Banks
*
* TODO -- technically we could generalize this code to support a pseudo-duplicate marking
* TODO -- algorithm that could collect all duplicates into a single super pileup element
*
* User: depristo
* Date: 3/26/11
* Time: 10:09 PM
*/
public class FragmentPileup implements Iterable<FragmentPileup.PerFragmentPileupElement> {
final Collection<PerFragmentPileupElement> fragments = new ArrayList<PerFragmentPileupElement>();
public class FragmentPileup {
final Collection<PileupElement> oneReadPile;
final Collection<TwoReadPileupElement> twoReadPile = new ArrayList<TwoReadPileupElement>();
/**
* Create a new Fragment-based pileup from the standard read-based pileup
@ -29,58 +35,50 @@ public class FragmentPileup implements Iterable<FragmentPileup.PerFragmentPileup
PileupElement pe1 = nameMap.get(readName);
if ( pe1 != null ) {
// assumes we have at most 2 reads per fragment
fragments.add(new PerFragmentPileupElement(pe1, p));
twoReadPile.add(new TwoReadPileupElement(pe1, p));
nameMap.remove(readName);
} else {
nameMap.put(readName, p);
}
}
// now go through the values in the nameMap to get the fragments with only a single read
for ( PileupElement p : nameMap.values() )
fragments.add(new PerFragmentPileupElement(p));
// now set the one Read pile to the values in the nameMap with only a single read
oneReadPile = nameMap.values();
}
/**
* Gets the fragments, in no particular order
* Gets the pileup elements containing two reads, in no particular order
*
* @return
*/
public Collection<PerFragmentPileupElement> getFragments() {
return fragments;
public Collection<TwoReadPileupElement> getTwoReadPileup() {
return twoReadPile;
}
/**
* Returns an iterator over the fragments. No specific order of fragments is assumed
* Gets the pileup elements containing one read, in no particular order
*
* @return
*/
public Iterator<PerFragmentPileupElement> iterator() {
return fragments.iterator();
public Collection<PileupElement> getOneReadPileup() {
return oneReadPile;
}
/**
* Useful helper class to represent a full read pair at a position
*
* User: ebanks
* User: ebanks, depristo
* Date: Jan 10, 2011
*/
public static class PerFragmentPileupElement {
protected PileupElement PE1 = null, PE2 = null;
/**
* Creates a fragment element that only contains a single read
* @param PE
*/
public PerFragmentPileupElement(PileupElement PE) {
PE1 = PE;
}
public static class TwoReadPileupElement {
final protected PileupElement PE1, PE2;
/**
* Creates a fragment element that contains both ends of a paired end read
* @param PE1
* @param PE2
*/
public PerFragmentPileupElement(PileupElement PE1, PileupElement PE2) {
public TwoReadPileupElement(PileupElement PE1, PileupElement PE2) {
this.PE1 = PE1;
this.PE2 = PE2;
}
@ -88,9 +86,6 @@ public class FragmentPileup implements Iterable<FragmentPileup.PerFragmentPileup
/** Returns the first pileup element -- never null */
public PileupElement getFirst() { return PE1; }
/** Is there a second read in this fragment element? */
public boolean hasSecond() { return PE2 != null; }
/** Returns the second read in this fragment element. May be null */
public PileupElement getSecond() { return PE2; }
}