From c604ed9440babc16d67668f4b30d6f933a7b48d1 Mon Sep 17 00:00:00 2001 From: delangel Date: Wed, 15 Sep 2010 21:53:08 +0000 Subject: [PATCH] Several improvements to new indel genotyper (more to come soon): a) Turns out previous change of centering haplotype around indel was a bad idea. Context to the left of indel is important but not as important as right one, because by definition all alleles start at the same location, so haplotype is the same to the left of indel regardless of allele. So, go back to having a constant size window to the left of event. b) Expand reference context so we can test larger haplotypes. c) Optimize computation of read likelihoods by doing them in linear array instead of in a matrix - no difference in biallelic sites but could be significantly faster in multiallelic sites. d) Bug fix: read alignment wasn't being computed correctly if, a) we were at an insertion, b) read started right at the insertion, c) read CIGAR didn't include insertion - more of these corner conditions are lurking, so a revamped computation of how reads align to candidate haplotypes is in the works. e) Add debug option not to use prior haplotype likelihoods. f) Don't hard-code NA12878 for genotyping, now sample name is a required input argument. g) Bug fix: if there are no reads covering a candidate indel event, just output NO_CALL (didn't notice this in HiSeq, but in P1 data it happens all the time). I need to add a confidence threshold for calling later on. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4291 348d0f76-0448-11de-a6fe-93d51630548a --- .../indels/HaplotypeIndelErrorModel.java | 9 +- .../walkers/SimpleIndelGenotyperWalker.java | 166 ++++++++++++------ .../sting/utils/genotype/Haplotype.java | 2 +- 3 files changed, 119 insertions(+), 58 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java index dd3bea0a7..2ad7892e6 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.AlignmentBlock; import net.sf.samtools.SAMRecord; +import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; @@ -124,7 +125,8 @@ public class HaplotypeIndelErrorModel { return -10.0*Math.log10(prob); } - public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read) { + public double computeReadLikelihoodGivenHaplotype(Haplotype haplotype, SAMRecord read, + VariantContext vc, int eventLength) { long numStartClippedBases = read.getAlignmentStart() - read.getUnclippedStart(); long numEndClippedBases = read.getUnclippedEnd() - read.getAlignmentEnd(); final long readStartPosition = read.getAlignmentStart(); @@ -154,6 +156,11 @@ public class HaplotypeIndelErrorModel { initialIndexInHaplotype = (int)(readStartPosition-haplotypeStartPosition); } + // special case 1: we're analyzing an insertion and the read starts right at the insertion. + if ((readStartPosition > vc.getStart()-eventLength) && vc.isInsertion() && !haplotype.isReference()) { + initialIndexInHaplotype += eventLength; + } + if (DEBUG) { System.out.println("READ: "+read.getReadName()); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimpleIndelGenotyperWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimpleIndelGenotyperWalker.java index db0c234b9..d0d7c64c2 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimpleIndelGenotyperWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimpleIndelGenotyperWalker.java @@ -51,7 +51,7 @@ import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.Window; -@Reference(window=@Window(start=-100,stop=100)) +@Reference(window=@Window(start=-30,stop=200)) @Allows({DataSource.READS, DataSource.REFERENCE}) public class SimpleIndelGenotyperWalker extends RefWalker { @Output @@ -76,12 +76,21 @@ public class SimpleIndelGenotyperWalker extends RefWalker { @Argument(fullName="indelHeterozygozity",shortName="indhet",doc="Indel Heterozygosity (assumed 1/6500 from empirical human data)",required=false) double indelHeterozygosity = (double)1.0/6500.0; + @Argument(fullName="sampleName",shortName="sample",doc="Sample name to evaluate genotypes in",required=true) + String sampleName; + @Argument(fullName="doSimpleCalculationModel",shortName="simple",doc="Use Simple Calculation Model for Pr(Reads | Haplotype)",required=false) boolean doSimple = false; @Argument(fullName="enableDebugOutput",shortName="debugout",doc="Output debug data",required=false) boolean DEBUG = false; + @Argument(fullName="doTrio",shortName="trio",doc="Output 1KG CEU trio genotype data (sampleName ignored)",required=false) + boolean doTrio = false; + + @Argument(fullName="useFlatPriors",shortName="flat",doc="If present, use flat priors on haplotypes",required=false) + boolean useFlatPriors = false; + @Override public boolean generateExtendedEvents() { return true; } @@ -154,49 +163,62 @@ public class SimpleIndelGenotyperWalker extends RefWalker { int eventLength = vc.getReference().getBaseString().length() - vc.getAlternateAllele(0).getBaseString().length(); // assume only one alt allele for now if (eventLength<0) eventLength = - eventLength; - - double lambda = (double)HAPLOTYPE_SIZE * indelHeterozygosity; - double altPrior = HaplotypeIndelErrorModel.probToQual(lambda)-HaplotypeIndelErrorModel.probToQual(eventLength)*1.89 + 0.15*eventLength - + HaplotypeIndelErrorModel.probToQual(1.5716)+ HaplotypeIndelErrorModel.probToQual(0.5); - haplotypeLikehoodMatrix[0][1] = altPrior; - haplotypeLikehoodMatrix[1][1] = 2*altPrior; + if (!useFlatPriors) { - int bestIndexI =0, bestIndexJ=0; - for (int i=0; i < haplotypesInVC.size(); i++) { - for (int j=i; j < haplotypesInVC.size(); j++){ - // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] - for (SAMRecord read : pileup.getReads()) { - // compute Pr(r | hi, hj) = 1/2*(Pr(r|hi) + Pr(r|hj) + double lambda = (double)HAPLOTYPE_SIZE * indelHeterozygosity; + double altPrior = HaplotypeIndelErrorModel.probToQual(lambda)-HaplotypeIndelErrorModel.probToQual(eventLength)*1.89 + 0.15*eventLength + + HaplotypeIndelErrorModel.probToQual(1.5716)+ HaplotypeIndelErrorModel.probToQual(0.5); - double readLikelihood[] = new double[2]; - double readLikeGivenH; - readLikeGivenH = model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(i), read); - readLikelihood[0]= -readLikeGivenH/10.0; - if (i != j){ - readLikeGivenH = model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read); - readLikelihood[1] = -readLikeGivenH/10.0; + haplotypeLikehoodMatrix[0][1] = altPrior; + haplotypeLikehoodMatrix[1][1] = 2*altPrior; + } + + int bestIndexI =-1, bestIndexJ=-1; + double callConfidence = 0.0; + + if (pileup.getReads().size() > 0) { + double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()]; + int i=0; + for (SAMRecord read : pileup.getReads()) { + // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi)) + // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent + for (int j=0; j < haplotypesInVC.size(); j++) { + readLikelihoods[i][j]= model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read, vc, eventLength); + if (DEBUG) { + System.out.print(read.getReadName()+" "); + + System.out.format("%d %d S:%d US:%d E:%d UE:%d C:%s %3.4f\n",i, j, read.getAlignmentStart(), + read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(), + read.getCigarString(), readLikelihoods[i][j]); } - else - readLikelihood[1] = readLikelihood[0]; - - // likelihood is by convention -10*log10(pr(r|h)) - // sumlog10 computes 10^x[0]+10^x[1]+... - double probRGivenHPair = MathUtils.sumLog10(readLikelihood)/2; - haplotypeLikehoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair); - /*System.out.print(read.getReadName()+" "); - System.out.format("%d %d S:%d US:%d E:%d UE:%d C:%s %3.4f %3.4f\n",i, j, read.getAlignmentStart(), - read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(), - read.getCigarString(), probRGivenHPair, haplotypeLikehoodMatrix[i][j]); - */ } + i++; + } + + for (i=0; i < haplotypesInVC.size(); i++) { + for (int j=i; j < haplotypesInVC.size(); j++){ + // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] + // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2) + //readLikelihoods[k][j] has log10(Pr(R_k) | H[j] ) + double[] readLikelihood = new double[2]; // diploid sample + for (int readIdx=0; readIdx < pileup.getReads().size(); readIdx++) { + readLikelihood[0] = -readLikelihoods[readIdx][i]/10; + readLikelihood[1] = -readLikelihoods[readIdx][j]/10; + + double probRGivenHPair = MathUtils.sumLog10(readLikelihood)/2; + haplotypeLikehoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair); + + } - if (haplotypeLikehoodMatrix[i][j] < bestLikelihood) { - bestIndexI = i; - bestIndexJ = j; - bestLikelihood = haplotypeLikehoodMatrix[i][j]; + if (haplotypeLikehoodMatrix[i][j] < bestLikelihood) { + bestIndexI = i; + bestIndexJ = j; + callConfidence = bestLikelihood - haplotypeLikehoodMatrix[i][j]; + bestLikelihood = haplotypeLikehoodMatrix[i][j]; + } } } } @@ -215,30 +237,49 @@ public class SimpleIndelGenotyperWalker extends RefWalker { out.format("%s %d %s ",vc.getChr(), vc.getStart(), type); - Genotype originalGenotype = vc.getGenotype("NA12878"); + if (doTrio) { + Genotype originalGenotype = vc.getGenotype("NA12878"); + Genotype dadGenotype = vc.getGenotype("NA12891"); + Genotype momGenotype = vc.getGenotype("NA12892"); - String oldG, newG; - if (originalGenotype.isHomRef()) - oldG = "HOMREF"; - else if (originalGenotype.isHet()) - oldG = "HET"; - else if (originalGenotype.isHomVar()) - oldG = "HOMVAR"; - else - oldG = "OTHER"; + String dadG, momG, oldG, newG; + oldG = getGenotypeString(originalGenotype); + dadG = getGenotypeString(dadGenotype); + momG = getGenotypeString(momGenotype); - int x = bestIndexI+bestIndexJ; - if (x == 0) - newG = "HOMREF"; - else if (x == 1) - newG = "HET"; - else if (x == 2) - newG = "HOMVAR"; - else - newG = "OTHER"; + int x = bestIndexI+bestIndexJ; + if (x == 0) + newG = "HOMREF"; + else if (x == 1) + newG = "HET"; + else if (x == 2) + newG = "HOMVAR"; + else if (x < 0) + newG = "NOCALL"; + else + newG = "OTHER"; - out.format("NewG %s OldG %s\n", newG, oldG); + out.format("NewG %s OldG %s DadG %s MomG %s\n", newG, oldG, dadG, momG); + } + else { + Genotype originalGenotype = vc.getGenotype(sampleName); + String oldG, newG; + oldG = getGenotypeString(originalGenotype); + int x = bestIndexI+bestIndexJ; + if (x == 0) + newG = "HOMREF"; + else if (x == 1) + newG = "HET"; + else if (x == 2) + newG = "HOMVAR"; + else if (x < 0) + newG = "NOCALL"; + else + newG = "OTHER"; + out.format("NewG %s OldG %s\n", newG, oldG); + + } /* if ( context.hasExtendedEventPileup() ) { @@ -272,6 +313,19 @@ public class SimpleIndelGenotyperWalker extends RefWalker { return 1; //To change body of implemented methods use File | Settings | File Templates. } + private String getGenotypeString(Genotype genotype) { + String oldG; + if (genotype.isHomRef()) + oldG = "HOMREF"; + else if (genotype.isHet()) + oldG = "HET"; + else if (genotype.isHomVar()) + oldG = "HOMVAR"; + else + oldG = "OTHER"; + + return oldG; + } /** * Provide an initial value for reduce computations. * diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java index cb37f12e6..884a26466 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java @@ -111,7 +111,7 @@ public class Haplotype { byte[] refBases = ref.getBases(); - int numPrefBases = haplotypeSize/2; + int numPrefBases = 20; int startIdxInReference = (int)(1+vc.getStart()-numPrefBases-ref.getWindow().getStart()); //int numPrefBases = (int)(vc.getStart()-ref.getWindow().getStart()+1); // indel vc starts one before event