A clean, fast way to compute fragment pileups. Now consumes no CPU time at all. Ready for general use.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5524 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-03-27 14:26:29 +00:00
parent bae0b6cba8
commit 231d095316
2 changed files with 99 additions and 70 deletions

View File

@ -34,10 +34,6 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import static java.lang.Math.log10; import static java.lang.Math.log10;
import static java.lang.Math.pow; import static java.lang.Math.pow;
@ -240,6 +236,14 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
return getPriors()[g.ordinal()]; return getPriors()[g.ordinal()];
} }
// -------------------------------------------------------------------------------------
//
// add() routines. These are the workhorse routines for calculating the overall genotype
// likelihoods given observed bases and reads. Includes high-level operators all the
// way down to single base and qual functions.
//
// -------------------------------------------------------------------------------------
public int add(ReadBackedPileup pileup) { public int add(ReadBackedPileup pileup) {
return add(pileup, false, false); return add(pileup, false, false);
} }
@ -258,59 +262,58 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
int n = 0; int n = 0;
// for each fragment, add to the likelihoods // for each fragment, add to the likelihoods
for ( FragmentPileup.PerFragmentPileupElement fragment : new FragmentPileup(pileup) ) { FragmentPileup fpile = new FragmentPileup(pileup);
n += add(fragment, ignoreBadBases, capBaseQualsAtMappingQual);
} for ( PileupElement p : fpile.getOneReadPileup() )
n += add(p, ignoreBadBases, capBaseQualsAtMappingQual);
for ( FragmentPileup.TwoReadPileupElement twoRead : fpile.getTwoReadPileup() )
n += add(twoRead, ignoreBadBases, capBaseQualsAtMappingQual);
return n; return n;
} }
public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) { public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
return add(new FragmentPileup.PerFragmentPileupElement(elt), ignoreBadBases, capBaseQualsAtMappingQual); byte obsBase = elt.getBase();
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual);
return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0) : 0;
} }
private final static byte qualToUse(PileupElement p, boolean capBaseQualsAtMappingQual) { public int add(FragmentPileup.TwoReadPileupElement twoRead, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
byte qual = p.getQual(); final byte observedBase1 = twoRead.getFirst().getBase();
final byte qualityScore1 = qualToUse(twoRead.getFirst(), ignoreBadBases, capBaseQualsAtMappingQual);
final byte observedBase2 = twoRead.getSecond().getBase();
final byte qualityScore2 = qualToUse(twoRead.getSecond(), ignoreBadBases, capBaseQualsAtMappingQual);
if ( qual > SAMUtils.MAX_PHRED_SCORE ) if ( qualityScore1 == 0 ) {
throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName())); if ( qualityScore2 == 0 ) // abort early if we didn't see any good bases
if ( capBaseQualsAtMappingQual ) return 0;
qual = (byte)Math.min((int)p.getQual(), p.getMappingQual()); else {
return add(observedBase2, qualityScore2, (byte)0, (byte)0);
return qual; }
} else {
return add(observedBase1, qualityScore1, observedBase2, qualityScore2);
}
} }
public int add(FragmentPileup.PerFragmentPileupElement fragment, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) { /**
*
* @param obsBase1
* @param qual1
* @param obsBase2
* @param qual2 can be 0, indicating no second base was observed for this fragment
* @return
*/
private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) {
// TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine // TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine
// TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future. // TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future.
// TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here. // TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here.
byte observedBase1 = 0, observedBase2 = 0, qualityScore1 = 0, qualityScore2 = 0;
if ( usableBase(fragment.getFirst(), ignoreBadBases) ) {
observedBase1 = fragment.getFirst().getBase();
qualityScore1 = qualToUse(fragment.getFirst(), capBaseQualsAtMappingQual);
}
if ( fragment.hasSecond() && usableBase(fragment.getSecond(), ignoreBadBases) ) {
observedBase2 = fragment.getSecond().getBase();
qualityScore2 = qualToUse(fragment.getSecond(), capBaseQualsAtMappingQual);
}
if ( observedBase1 == 0 ) {
if ( observedBase2 == 0 ) // abort early if we didn't see any good bases
return 0;
else { // otherwise make 2 1
observedBase1 = observedBase2;
qualityScore1 = qualityScore2;
observedBase2 = qualityScore2 = 0;
}
}
// Just look up the cached result if it's available, or compute and store it // Just look up the cached result if it's available, or compute and store it
DiploidSNPGenotypeLikelihoods gl; DiploidSNPGenotypeLikelihoods gl;
if ( ! inCache(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY) ) { if ( ! inCache(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY) ) {
gl = calculateCachedGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY); gl = calculateCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY);
} else { } else {
gl = getCachedGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY); gl = getCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY);
} }
// for bad bases, there are no likelihoods // for bad bases, there are no likelihoods
@ -334,6 +337,12 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
return 1; return 1;
} }
// -------------------------------------------------------------------------------------
//
// Dealing with the cache routines
//
// -------------------------------------------------------------------------------------
static DiploidSNPGenotypeLikelihoods[][][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][BaseUtils.BASES.length+1][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY]; static DiploidSNPGenotypeLikelihoods[][][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][BaseUtils.BASES.length+1][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY];
protected boolean inCache(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { protected boolean inCache(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
@ -475,6 +484,31 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
return logP; return logP;
} }
/**
* Helper function that returns the phred-scaled base quality score we should use for calculating
* likelihoods for a pileup element. May return 0 to indicate that the observation is bad, and may
* cap the quality score by the mapping quality of the read itself.
*
* @param p
* @param ignoreBadBases
* @param capBaseQualsAtMappingQual
* @return
*/
private final static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
if ( ! usableBase(p, ignoreBadBases) ) {
return 0;
} else {
byte qual = p.getQual();
if ( qual > SAMUtils.MAX_PHRED_SCORE )
throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
if ( capBaseQualsAtMappingQual )
qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
return qual;
}
}
// ----------------------------------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------------------------------
// //
// //

View File

@ -3,17 +3,23 @@ package org.broadinstitute.sting.utils.pileup;
import java.util.*; import java.util.*;
/** /**
* An easy to access fragment-based pileup. new FragmentPileup(RBP) creates one, and you * An easy to access fragment-based pileup, which contains two separate pileups. The first
* can either iterate over or get the collection of PerFragmentPileupElements. * is a regular collection of PileupElements containing all of the reads in the original RBP
* that uniquely info about a fragment. The second are TwoReadPileupElements that, as the
* name suggests, contain two reads that are sequenced from the same underlying fragment.
* *
* Based on the original code by E. Banks * Based on the original code by E. Banks
* *
* TODO -- technically we could generalize this code to support a pseudo-duplicate marking
* TODO -- algorithm that could collect all duplicates into a single super pileup element
*
* User: depristo * User: depristo
* Date: 3/26/11 * Date: 3/26/11
* Time: 10:09 PM * Time: 10:09 PM
*/ */
public class FragmentPileup implements Iterable<FragmentPileup.PerFragmentPileupElement> { public class FragmentPileup {
final Collection<PerFragmentPileupElement> fragments = new ArrayList<PerFragmentPileupElement>(); final Collection<PileupElement> oneReadPile;
final Collection<TwoReadPileupElement> twoReadPile = new ArrayList<TwoReadPileupElement>();
/** /**
* Create a new Fragment-based pileup from the standard read-based pileup * Create a new Fragment-based pileup from the standard read-based pileup
@ -29,58 +35,50 @@ public class FragmentPileup implements Iterable<FragmentPileup.PerFragmentPileup
PileupElement pe1 = nameMap.get(readName); PileupElement pe1 = nameMap.get(readName);
if ( pe1 != null ) { if ( pe1 != null ) {
// assumes we have at most 2 reads per fragment // assumes we have at most 2 reads per fragment
fragments.add(new PerFragmentPileupElement(pe1, p)); twoReadPile.add(new TwoReadPileupElement(pe1, p));
nameMap.remove(readName); nameMap.remove(readName);
} else { } else {
nameMap.put(readName, p); nameMap.put(readName, p);
} }
} }
// now go through the values in the nameMap to get the fragments with only a single read // now set the one Read pile to the values in the nameMap with only a single read
for ( PileupElement p : nameMap.values() ) oneReadPile = nameMap.values();
fragments.add(new PerFragmentPileupElement(p));
} }
/** /**
* Gets the fragments, in no particular order * Gets the pileup elements containing two reads, in no particular order
* *
* @return * @return
*/ */
public Collection<PerFragmentPileupElement> getFragments() { public Collection<TwoReadPileupElement> getTwoReadPileup() {
return fragments; return twoReadPile;
} }
/** /**
* Returns an iterator over the fragments. No specific order of fragments is assumed * Gets the pileup elements containing one read, in no particular order
*
* @return * @return
*/ */
public Iterator<PerFragmentPileupElement> iterator() { public Collection<PileupElement> getOneReadPileup() {
return fragments.iterator(); return oneReadPile;
} }
/** /**
* Useful helper class to represent a full read pair at a position * Useful helper class to represent a full read pair at a position
* *
* User: ebanks * User: ebanks, depristo
* Date: Jan 10, 2011 * Date: Jan 10, 2011
*/ */
public static class PerFragmentPileupElement { public static class TwoReadPileupElement {
protected PileupElement PE1 = null, PE2 = null; final protected PileupElement PE1, PE2;
/**
* Creates a fragment element that only contains a single read
* @param PE
*/
public PerFragmentPileupElement(PileupElement PE) {
PE1 = PE;
}
/** /**
* Creates a fragment element that contains both ends of a paired end read * Creates a fragment element that contains both ends of a paired end read
* @param PE1 * @param PE1
* @param PE2 * @param PE2
*/ */
public PerFragmentPileupElement(PileupElement PE1, PileupElement PE2) { public TwoReadPileupElement(PileupElement PE1, PileupElement PE2) {
this.PE1 = PE1; this.PE1 = PE1;
this.PE2 = PE2; this.PE2 = PE2;
} }
@ -88,9 +86,6 @@ public class FragmentPileup implements Iterable<FragmentPileup.PerFragmentPileup
/** Returns the first pileup element -- never null */ /** Returns the first pileup element -- never null */
public PileupElement getFirst() { return PE1; } public PileupElement getFirst() { return PE1; }
/** Is there a second read in this fragment element? */
public boolean hasSecond() { return PE2 != null; }
/** Returns the second read in this fragment element. May be null */ /** Returns the second read in this fragment element. May be null */
public PileupElement getSecond() { return PE2; } public PileupElement getSecond() { return PE2; }
} }