A clean, fast way to compute fragment pileups. Now consumes no CPU time at all. Ready for general use.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5524 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
bae0b6cba8
commit
231d095316
|
|
@ -34,10 +34,6 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
|||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
|
||||
|
|
@ -240,6 +236,14 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
return getPriors()[g.ordinal()];
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// add() routines. These are the workhorse routines for calculating the overall genotype
|
||||
// likelihoods given observed bases and reads. Includes high-level operators all the
|
||||
// way down to single base and qual functions.
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
public int add(ReadBackedPileup pileup) {
|
||||
return add(pileup, false, false);
|
||||
}
|
||||
|
|
@ -258,59 +262,58 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
int n = 0;
|
||||
|
||||
// for each fragment, add to the likelihoods
|
||||
for ( FragmentPileup.PerFragmentPileupElement fragment : new FragmentPileup(pileup) ) {
|
||||
n += add(fragment, ignoreBadBases, capBaseQualsAtMappingQual);
|
||||
}
|
||||
FragmentPileup fpile = new FragmentPileup(pileup);
|
||||
|
||||
for ( PileupElement p : fpile.getOneReadPileup() )
|
||||
n += add(p, ignoreBadBases, capBaseQualsAtMappingQual);
|
||||
|
||||
for ( FragmentPileup.TwoReadPileupElement twoRead : fpile.getTwoReadPileup() )
|
||||
n += add(twoRead, ignoreBadBases, capBaseQualsAtMappingQual);
|
||||
|
||||
return n;
|
||||
}
|
||||
public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
|
||||
return add(new FragmentPileup.PerFragmentPileupElement(elt), ignoreBadBases, capBaseQualsAtMappingQual);
|
||||
byte obsBase = elt.getBase();
|
||||
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual);
|
||||
return qual > 0 ? add(obsBase, qual, (byte)0, (byte)0) : 0;
|
||||
}
|
||||
|
||||
private final static byte qualToUse(PileupElement p, boolean capBaseQualsAtMappingQual) {
|
||||
byte qual = p.getQual();
|
||||
public int add(FragmentPileup.TwoReadPileupElement twoRead, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
|
||||
final byte observedBase1 = twoRead.getFirst().getBase();
|
||||
final byte qualityScore1 = qualToUse(twoRead.getFirst(), ignoreBadBases, capBaseQualsAtMappingQual);
|
||||
final byte observedBase2 = twoRead.getSecond().getBase();
|
||||
final byte qualityScore2 = qualToUse(twoRead.getSecond(), ignoreBadBases, capBaseQualsAtMappingQual);
|
||||
|
||||
if ( qual > SAMUtils.MAX_PHRED_SCORE )
|
||||
throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
|
||||
if ( capBaseQualsAtMappingQual )
|
||||
qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
|
||||
|
||||
return qual;
|
||||
if ( qualityScore1 == 0 ) {
|
||||
if ( qualityScore2 == 0 ) // abort early if we didn't see any good bases
|
||||
return 0;
|
||||
else {
|
||||
return add(observedBase2, qualityScore2, (byte)0, (byte)0);
|
||||
}
|
||||
} else {
|
||||
return add(observedBase1, qualityScore1, observedBase2, qualityScore2);
|
||||
}
|
||||
}
|
||||
|
||||
public int add(FragmentPileup.PerFragmentPileupElement fragment, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
|
||||
/**
|
||||
*
|
||||
* @param obsBase1
|
||||
* @param qual1
|
||||
* @param obsBase2
|
||||
* @param qual2 can be 0, indicating no second base was observed for this fragment
|
||||
* @return
|
||||
*/
|
||||
private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) {
|
||||
// TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine
|
||||
// TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future.
|
||||
// TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here.
|
||||
byte observedBase1 = 0, observedBase2 = 0, qualityScore1 = 0, qualityScore2 = 0;
|
||||
|
||||
if ( usableBase(fragment.getFirst(), ignoreBadBases) ) {
|
||||
observedBase1 = fragment.getFirst().getBase();
|
||||
qualityScore1 = qualToUse(fragment.getFirst(), capBaseQualsAtMappingQual);
|
||||
}
|
||||
|
||||
if ( fragment.hasSecond() && usableBase(fragment.getSecond(), ignoreBadBases) ) {
|
||||
observedBase2 = fragment.getSecond().getBase();
|
||||
qualityScore2 = qualToUse(fragment.getSecond(), capBaseQualsAtMappingQual);
|
||||
}
|
||||
|
||||
if ( observedBase1 == 0 ) {
|
||||
if ( observedBase2 == 0 ) // abort early if we didn't see any good bases
|
||||
return 0;
|
||||
else { // otherwise make 2 1
|
||||
observedBase1 = observedBase2;
|
||||
qualityScore1 = qualityScore2;
|
||||
observedBase2 = qualityScore2 = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Just look up the cached result if it's available, or compute and store it
|
||||
DiploidSNPGenotypeLikelihoods gl;
|
||||
if ( ! inCache(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY) ) {
|
||||
gl = calculateCachedGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY);
|
||||
if ( ! inCache(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY) ) {
|
||||
gl = calculateCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY);
|
||||
} else {
|
||||
gl = getCachedGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2, FIXED_PLOIDY);
|
||||
gl = getCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY);
|
||||
}
|
||||
|
||||
// for bad bases, there are no likelihoods
|
||||
|
|
@ -334,6 +337,12 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
return 1;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Dealing with the cache routines
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
static DiploidSNPGenotypeLikelihoods[][][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][BaseUtils.BASES.length+1][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY];
|
||||
|
||||
protected boolean inCache(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
|
||||
|
|
@ -475,6 +484,31 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
return logP;
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function that returns the phred-scaled base quality score we should use for calculating
|
||||
* likelihoods for a pileup element. May return 0 to indicate that the observation is bad, and may
|
||||
* cap the quality score by the mapping quality of the read itself.
|
||||
*
|
||||
* @param p
|
||||
* @param ignoreBadBases
|
||||
* @param capBaseQualsAtMappingQual
|
||||
* @return
|
||||
*/
|
||||
private final static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
|
||||
if ( ! usableBase(p, ignoreBadBases) ) {
|
||||
return 0;
|
||||
} else {
|
||||
byte qual = p.getQual();
|
||||
|
||||
if ( qual > SAMUtils.MAX_PHRED_SCORE )
|
||||
throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
|
||||
if ( capBaseQualsAtMappingQual )
|
||||
qual = (byte)Math.min((int)p.getQual(), p.getMappingQual());
|
||||
|
||||
return qual;
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
|
|
|
|||
|
|
@ -3,17 +3,23 @@ package org.broadinstitute.sting.utils.pileup;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* An easy to access fragment-based pileup. new FragmentPileup(RBP) creates one, and you
|
||||
* can either iterate over or get the collection of PerFragmentPileupElements.
|
||||
* An easy to access fragment-based pileup, which contains two separate pileups. The first
|
||||
* is a regular collection of PileupElements containing all of the reads in the original RBP
|
||||
* that uniquely provide info about a fragment. The second is a collection of TwoReadPileupElements that, as the
|
||||
* name suggests, contain two reads that are sequenced from the same underlying fragment.
|
||||
*
|
||||
* Based on the original code by E. Banks
|
||||
*
|
||||
* TODO -- technically we could generalize this code to support a pseudo-duplicate marking
|
||||
* TODO -- algorithm that could collect all duplicates into a single super pileup element
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 3/26/11
|
||||
* Time: 10:09 PM
|
||||
*/
|
||||
public class FragmentPileup implements Iterable<FragmentPileup.PerFragmentPileupElement> {
|
||||
final Collection<PerFragmentPileupElement> fragments = new ArrayList<PerFragmentPileupElement>();
|
||||
public class FragmentPileup {
|
||||
final Collection<PileupElement> oneReadPile;
|
||||
final Collection<TwoReadPileupElement> twoReadPile = new ArrayList<TwoReadPileupElement>();
|
||||
|
||||
/**
|
||||
* Create a new Fragment-based pileup from the standard read-based pileup
|
||||
|
|
@ -29,58 +35,50 @@ public class FragmentPileup implements Iterable<FragmentPileup.PerFragmentPileup
|
|||
PileupElement pe1 = nameMap.get(readName);
|
||||
if ( pe1 != null ) {
|
||||
// assumes we have at most 2 reads per fragment
|
||||
fragments.add(new PerFragmentPileupElement(pe1, p));
|
||||
twoReadPile.add(new TwoReadPileupElement(pe1, p));
|
||||
nameMap.remove(readName);
|
||||
} else {
|
||||
nameMap.put(readName, p);
|
||||
}
|
||||
}
|
||||
|
||||
// now go through the values in the nameMap to get the fragments with only a single read
|
||||
for ( PileupElement p : nameMap.values() )
|
||||
fragments.add(new PerFragmentPileupElement(p));
|
||||
// now set the one Read pile to the values in the nameMap with only a single read
|
||||
oneReadPile = nameMap.values();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the fragments, in no particular order
|
||||
* Gets the pileup elements containing two reads, in no particular order
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Collection<PerFragmentPileupElement> getFragments() {
|
||||
return fragments;
|
||||
public Collection<TwoReadPileupElement> getTwoReadPileup() {
|
||||
return twoReadPile;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an iterator over the fragments. No specific order of fragments is assumed
|
||||
* Gets the pileup elements containing one read, in no particular order
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Iterator<PerFragmentPileupElement> iterator() {
|
||||
return fragments.iterator();
|
||||
public Collection<PileupElement> getOneReadPileup() {
|
||||
return oneReadPile;
|
||||
}
|
||||
|
||||
/**
|
||||
* Useful helper class to represent a full read pair at a position
|
||||
*
|
||||
* User: ebanks
|
||||
* User: ebanks, depristo
|
||||
* Date: Jan 10, 2011
|
||||
*/
|
||||
public static class PerFragmentPileupElement {
|
||||
protected PileupElement PE1 = null, PE2 = null;
|
||||
|
||||
/**
|
||||
* Creates a fragment element that only contains a single read
|
||||
* @param PE
|
||||
*/
|
||||
public PerFragmentPileupElement(PileupElement PE) {
|
||||
PE1 = PE;
|
||||
}
|
||||
public static class TwoReadPileupElement {
|
||||
final protected PileupElement PE1, PE2;
|
||||
|
||||
/**
|
||||
* Creates a fragment element that contains both ends of a paired end read
|
||||
* @param PE1
|
||||
* @param PE2
|
||||
*/
|
||||
public PerFragmentPileupElement(PileupElement PE1, PileupElement PE2) {
|
||||
public TwoReadPileupElement(PileupElement PE1, PileupElement PE2) {
|
||||
this.PE1 = PE1;
|
||||
this.PE2 = PE2;
|
||||
}
|
||||
|
|
@ -88,9 +86,6 @@ public class FragmentPileup implements Iterable<FragmentPileup.PerFragmentPileup
|
|||
/** Returns the first pileup element -- never null */
|
||||
public PileupElement getFirst() { return PE1; }
|
||||
|
||||
/** Is there a second read in this fragment element? */
|
||||
public boolean hasSecond() { return PE2 != null; }
|
||||
|
||||
/** Returns the second read in this fragment element. May be null */
|
||||
public PileupElement getSecond() { return PE2; }
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue