Improvements to indel calling in pool caller: a) Compute per-read likelihoods in reference sample to determine whether a read is informative or not. b) Fixed bugs in unit tests. c) Fixed padding-related bugs when computing matches/mismatches in ErrorModel, d) Added a couple more integration tests to increase test coverage, including testing odd ploidy

This commit is contained in:
Guillermo del Angel 2012-07-26 13:43:00 -04:00
parent 1e8610b2c6
commit 2ae890155c
8 changed files with 120 additions and 46 deletions

View File

@ -1,6 +1,10 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import com.google.java.contract.Requires;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -9,6 +13,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
/**
* Created by IntelliJ IDEA.
@ -30,24 +35,26 @@ public class ErrorModel {
private static final boolean compressRange = false;
private static final double log10MinusE = Math.log10(Math.exp(1.0));
private static final boolean DEBUG = false;
/**
* Calculates the probability of the data (reference sample reads) given the phred scaled site quality score.
*
* @param minQualityScore Minimum site quality score to evaluate
* @param maxQualityScore Maximum site quality score to evaluate
* @param phredScaledPrior Prior for site quality
* @param UAC Argument Collection
* @param refSamplePileup Reference sample pileup
* @param refSampleVC VC with True alleles in reference sample pileup
* @param minPower Minimum power
*/
public ErrorModel (byte minQualityScore, byte maxQualityScore, byte phredScaledPrior,
ReadBackedPileup refSamplePileup, VariantContext refSampleVC, double minPower) {
this.maxQualityScore = maxQualityScore;
this.minQualityScore = minQualityScore;
this.phredScaledPrior = phredScaledPrior;
log10minPower = Math.log10(minPower);
public ErrorModel (final UnifiedArgumentCollection UAC,
final ReadBackedPileup refSamplePileup,
VariantContext refSampleVC, final ReferenceContext refContext) {
this.maxQualityScore = UAC.maxQualityScore;
this.minQualityScore = UAC.minQualityScore;
this.phredScaledPrior = UAC.phredScaledPrior;
log10minPower = Math.log10(UAC.minPower);
PairHMMIndelErrorModel pairModel = null;
LinkedHashMap<Allele, Haplotype> haplotypeMap = null;
HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap = null;
double[][] perReadLikelihoods = null;
double[] model = new double[maxQualityScore+1];
Arrays.fill(model,Double.NEGATIVE_INFINITY);
@ -61,11 +68,17 @@ public class ErrorModel {
break;
}
}
if (refSampleVC.isIndel()) {
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
indelLikelihoodMap = new HashMap<PileupElement, LinkedHashMap<Allele, Double>>();
IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements
}
}
double p = MathUtils.phredScaleToLog10Probability((byte)(maxQualityScore-minQualityScore));
if (refSamplePileup == null || refSampleVC == null || !hasCalledAlleles) {
double p = MathUtils.phredScaleToLog10Probability((byte)(maxQualityScore-minQualityScore));
for (byte q=minQualityScore; q<=maxQualityScore; q++) {
// maximum uncertainty if there's no ref data at site
model[q] = p;
@ -75,23 +88,47 @@ public class ErrorModel {
else {
hasData = true;
int matches = 0;
int coverage = refSamplePileup.getNumberOfElements();
int coverage = 0;
Allele refAllele = refSampleVC.getReference();
if (refSampleVC.isIndel()) {
final int readCounts[] = new int[refSamplePileup.getNumberOfElements()];
//perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()];
final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles());
perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, indelLikelihoodMap, readCounts);
}
int idx = 0;
for (PileupElement refPileupElement : refSamplePileup) {
if (DEBUG)
System.out.println(refPileupElement.toString());
boolean isMatch = false;
for (Allele allele : refSampleVC.getAlleles())
isMatch |= pileupElementMatches(refPileupElement, allele, refAllele);
for (Allele allele : refSampleVC.getAlleles()) {
boolean m = pileupElementMatches(refPileupElement, allele, refAllele, refContext.getBase());
if (DEBUG) System.out.println(m);
isMatch |= m;
}
if (refSampleVC.isIndel()) {
// ignore match/mismatch if reads, as determined by their likelihood, are not informative
double[] perAlleleLikelihoods = perReadLikelihoods[idx++];
if (!isInformativeElement(perAlleleLikelihoods))
matches++;
else
matches += (isMatch?1:0);
matches += (isMatch?1:0);
// System.out.format("MATCH:%b\n",isMatch);
} else {
matches += (isMatch?1:0);
}
coverage++;
}
int mismatches = coverage - matches;
//System.out.format("Cov:%d match:%d mismatch:%d\n",coverage, matches, mismatches);
for (byte q=minQualityScore; q<=maxQualityScore; q++) {
model[q] = log10PoissonProbabilitySiteGivenQual(q,coverage, mismatches);
if (coverage==0)
model[q] = p;
else
model[q] = log10PoissonProbabilitySiteGivenQual(q,coverage, mismatches);
}
this.refDepth = coverage;
}
@ -101,6 +138,17 @@ public class ErrorModel {
}
@Requires("likelihoods.length>0")
private boolean isInformativeElement(double[] likelihoods) {
    // A read discriminates between alleles only if its per-allele likelihoods
    // actually differ: when the spread between the best and worst likelihood
    // falls below this threshold, the read carries no information.
    final double MIN_LIKELIHOOD_SPREAD = 0.1;
    final double best = likelihoods[MathUtils.maxElementIndex(likelihoods)];
    final double worst = likelihoods[MathUtils.minElementIndex(likelihoods)];
    return best - worst >= MIN_LIKELIHOOD_SPREAD;
}
/**
* Simple constructor that just takes a given log-probability vector as error model.
* Only intended for unit testing, not general usage.
@ -115,23 +163,27 @@ public class ErrorModel {
}
public static boolean pileupElementMatches(PileupElement pileupElement, Allele allele, Allele refAllele) {
/* System.out.format("PE: base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d Allele:%s RefAllele:%s\n",
public static boolean pileupElementMatches(PileupElement pileupElement, Allele allele, Allele refAllele, byte refBase) {
if (DEBUG)
System.out.format("PE: base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d Allele:%s RefAllele:%s\n",
pileupElement.getBase(), pileupElement.isBeforeDeletionStart(),
pileupElement.isBeforeInsertion(),pileupElement.getEventBases(),pileupElement.getEventLength(), allele.toString(), refAllele.toString());
*/
//pileupElement.
// if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch
if (allele.isReference()) {
// for a ref allele, any base mismatch or new indel is a mismatch.
if(allele.getBases().length>0 && allele.getBases().length == refAllele.getBases().length ) // SNP/MNP case
return (/*!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart() &&*/ pileupElement.getBase() == allele.getBases()[0]);
if(allele.getBases().length>0 )
// todo - can't check vs. allele because allele is not padded so it doesn't include the reference base at this location
// could clean up/simplify this when unpadding is removed
return (pileupElement.getBase() == refBase);
else
// either null allele to compare, or ref/alt lengths are different (indel by definition).
// if we have an indel that we are comparing against a REF allele, any indel presence (of any length/content) is a mismatch
return (!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart());
}
// for non-ref alleles to compare:
if (refAllele.getBases().length == allele.getBases().length)
// alleles have the same length (eg snp or mnp)
return pileupElement.getBase() == allele.getBases()[0];

View File

@ -247,7 +247,8 @@ public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLi
// find the alternate allele(s) that we should be using
final List<Allele> alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs);
if (alleles == null || alleles.isEmpty())
return null;
// start making the VariantContext
final GenomeLoc loc = ref.getLocus();
final int endLoc = getEndLocation(tracker, ref, alleles);
@ -313,7 +314,7 @@ public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLi
refLanePileup = refPileup.getPileupForLane(laneID);
//ReferenceSample referenceSample = new ReferenceSample(UAC.referenceSampleName, refLanePileup, trueReferenceAlleles);
perLaneErrorModels.put(laneID, new ErrorModel(UAC.minQualityScore, UAC.maxQualityScore, UAC.phredScaledPrior, refLanePileup, refVC, UAC.minPower));
perLaneErrorModels.put(laneID, new ErrorModel(UAC, refLanePileup, refVC, ref));
}
return perLaneErrorModels;

View File

@ -25,6 +25,8 @@ public class PoolIndelGenotypeLikelihoods extends PoolGenotypeLikelihoods {
final int eventLength;
double[][] readHaplotypeLikelihoods;
final byte refBase;
public PoolIndelGenotypeLikelihoods(final List<Allele> alleles,
final double[] logLikelihoods,
final int ploidy,
@ -38,6 +40,8 @@ public class PoolIndelGenotypeLikelihoods extends PoolGenotypeLikelihoods {
this.haplotypeMap = haplotypeMap;
this.refContext = referenceContext;
this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles);
// todo - not needed if indel alleles have base at current position
this.refBase = referenceContext.getBase();
}
// -------------------------------------------------------------------------------------
@ -141,7 +145,7 @@ public class PoolIndelGenotypeLikelihoods extends PoolGenotypeLikelihoods {
final int numHaplotypes = haplotypeMap.size();
final int readCounts[] = new int[pileup.getNumberOfElements()];
readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, PoolIndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(), readCounts);
readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(), readCounts);
n = readHaplotypeLikelihoods.length;
} else {
Allele refAllele = null;
@ -161,7 +165,7 @@ public class PoolIndelGenotypeLikelihoods extends PoolGenotypeLikelihoods {
int idx =0;
for (Allele allele : alleles) {
int cnt = numSeenBases.get(idx);
numSeenBases.set(idx++,cnt + (ErrorModel.pileupElementMatches(elt, allele, refAllele)?1:0));
numSeenBases.set(idx++,cnt + (ErrorModel.pileupElementMatches(elt, allele, refAllele, refBase)?1:0));
}
n++;

View File

@ -70,12 +70,6 @@ public class PoolIndelGenotypeLikelihoodsCalculationModel extends PoolGenotypeLi
}
public static HashMap<PileupElement, LinkedHashMap<Allele, Double>> getIndelLikelihoodMap() {
return IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
}
protected PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List<Allele> alleles,
final double[] logLikelihoods,
final int ploidy,

View File

@ -37,6 +37,13 @@ public class PoolCallerIntegrationTest extends WalkerTest {
executeTest("testPoolCaller:"+name+" args=" + args, spec);
}
/**
 * Runs a pool-caller integration test in lane-ignoring mode (-ignoreLane,
 * i.e. without per-lane reference handling) and checks the output MD5.
 *
 * @param args  extra command-line arguments appended to the fixed base command
 * @param name  short test name used in the reported test label
 * @param model genotype likelihoods model to pass via -glm
 * @param md5   expected MD5 of the produced output file
 */
private void PC_LSV_Test_NoRef(String args, String name, String model, String md5) {
    final String fixedArgs = String.format("-T UnifiedGenotyper -R %s -I %s -L %s -glm %s -ignoreLane -pnrm POOL",
            REF, LSV_BAM, LSVINTERVALS, model) + " --no_cmdline_in_header -o %s";
    final String commandLine = fixedArgs + " " + args;
    final WalkerTestSpec spec = new WalkerTestSpec(commandLine, Arrays.asList(md5));
    executeTest("testPoolCaller:" + name + " args=" + args, spec);
}
@Test
public void testBOTH_GGA_Pools() {
PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLBOTH","36b8db57f65be1cc3d2d9d7f9f3f26e4");
@ -44,7 +51,17 @@ public class PoolCallerIntegrationTest extends WalkerTest {
@Test
public void testINDEL_GGA_Pools() {
PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLINDEL","d1339990291648495bfcf4404f051478");
PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","POOLINDEL","d1339990291648495bfcf4404f051478");
}
@Test
public void testINDEL_maxAlleles2_ploidy3_Pools_noRef() {
    // Indel discovery with odd ploidy (3) and no reference sample.
    final String expectedMd5 = "b66e7150603310fd57ee7bf9fc590706";
    PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 3", "LSV_INDEL_DISC_NOREF_p3", "POOLINDEL", expectedMd5);
}
@Test
public void testINDEL_maxAlleles2_ploidy1_Pools_noRef() {
    // Indel discovery with haploid pools (ploidy 1) and no reference sample.
    final String expectedMd5 = "ccdae3fc4d2c922f956a186aaad51c29";
    PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "POOLINDEL", expectedMd5);
}
@Test

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
@ -49,6 +50,13 @@ public class PoolGenotypeLikelihoodsUnitTest {
private static final boolean SIMULATE_NOISY_PILEUP = false;
private static final int NUM_SIMULATED_OBS = 10;
void PoolGenotypeLikelihoodsUnitTest() {
UAC.minQualityScore = 5;
UAC.maxQualityScore = 40;
UAC.phredScaledPrior = (byte)20;
UAC.minPower = 0.0;
}
@Test
public void testStoringLikelihoodElements() {
@ -251,8 +259,6 @@ public class PoolGenotypeLikelihoodsUnitTest {
@Test
public void testErrorModel() {
final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref");
final byte minQ = 5;
final byte maxQ = 40;
final byte refByte = refPileupTestProvider.getRefByte();
final byte altByte = refByte == (byte)'T'? (byte) 'C': (byte)'T';
final String refSampleName = refPileupTestProvider.getSampleNames().get(0);
@ -270,7 +276,7 @@ public class PoolGenotypeLikelihoodsUnitTest {
// get artificial alignment context for ref sample - no noise
Map<String,AlignmentContext> refContext = refPileupTestProvider.getAlignmentContextFromAlleles(0, new String(new byte[]{altByte}), new int[]{matches, mismatches}, false, 30);
final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup();
final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refVC, 0.0);
final ErrorModel emodel = new ErrorModel(UAC, refPileup, refVC, refPileupTestProvider.getReferenceContext());
final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector();
final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches));
@ -287,8 +293,6 @@ public class PoolGenotypeLikelihoodsUnitTest {
@Test
public void testIndelErrorModel() {
final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref");
final byte minQ = 5;
final byte maxQ = 40;
final byte refByte = refPileupTestProvider.getRefByte();
final String altBases = "TCA";
final String refSampleName = refPileupTestProvider.getSampleNames().get(0);
@ -313,7 +317,7 @@ public class PoolGenotypeLikelihoodsUnitTest {
// Ref sample has TC insertion but pileup will have TCA inserted instead to test mismatches
Map<String,AlignmentContext> refContext = refPileupTestProvider.getAlignmentContextFromAlleles(altBases.length(), altBases, new int[]{matches, mismatches}, false, 30);
final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup();
final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refInsertionVC, 0.0);
final ErrorModel emodel = new ErrorModel(UAC, refPileup, refInsertionVC, refPileupTestProvider.getReferenceContext());
final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector();
final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches));
@ -343,7 +347,7 @@ public class PoolGenotypeLikelihoodsUnitTest {
// Ref sample has 4bp deletion but pileup will have 3 bp deletion instead to test mismatches
Map<String,AlignmentContext> refContext = refPileupTestProvider.getAlignmentContextFromAlleles(-delLength+1, altBases, new int[]{matches, mismatches}, false, 30);
final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup();
final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refDeletionVC, 0.0);
final ErrorModel emodel = new ErrorModel(UAC, refPileup, refDeletionVC, refPileupTestProvider.getReferenceContext());
final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector();
final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches));

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.walkers.indels;
import com.google.java.contract.Ensures;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
@ -175,7 +176,8 @@ public class PairHMMIndelErrorModel {
}
public synchronized double[][] computeGeneralReadHaplotypeLikelihoods(final ReadBackedPileup pileup,
@Ensures("result != null && result.length == pileup.getNumberOfElements()")
public synchronized double[][] computeGeneralReadHaplotypeLikelihoods(final ReadBackedPileup pileup,
final LinkedHashMap<Allele, Haplotype> haplotypeMap,
final ReferenceContext ref,
final int eventLength,

View File

@ -90,7 +90,7 @@ public class ArtificialReadPileupTestProvider {
return sampleNames;
}
/** Returns the reference base at this provider's current locus. */
public byte getRefByte() {
    final byte refBase = referenceContext.getBase();
    return refBase;
}
/** Accessor for the artificial ReferenceContext backing this test provider. */
public ReferenceContext getReferenceContext() { return referenceContext;}
@ -107,7 +107,7 @@ public class ArtificialReadPileupTestProvider {
ArrayList<Allele> vcAlleles = new ArrayList<Allele>();
Allele refAllele, altAllele;
if (eventLength == 0) {// SNP case
refAllele =Allele.create(refBases.substring(offset,offset+1),true);
refAllele =Allele.create(referenceContext.getBase(),true);
altAllele = Allele.create(altBases.substring(0,1), false);
} else if (eventLength>0){