From ece694d0af6b384a816f818ebbd718eaff9d4ce4 Mon Sep 17 00:00:00 2001 From: delangel Date: Thu, 30 Sep 2010 21:33:59 +0000 Subject: [PATCH] Next iteration on new UG framework: - Brought over exact AF estimation from branch (which is now dead). Exact model is default in UnifiedGenotyperV2. - Implemented completely new genotyping algorithm given best AF estimate using dynamic programming, which in theory should be better than both greedy search and any HWE-based genotyper. - Integrated and added new Dindel likelihood estimation model. - Corrected annotators that would call readBasePileup: since we can be annotating extended events, best way is to interrogate context for kind of pileup and either readBasePileup or readExtendedEventPileup. All changes above except last one are still in playground since they require more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4396 348d0f76-0448-11de-a6fe-93d51630548a --- .../annotator/DepthPerAlleleBySample.java | 33 +- .../walkers/annotator/MappingQualityZero.java | 25 +- .../walkers/annotator/RMSMappingQuality.java | 19 +- .../walkers/annotator/SpanningDeletions.java | 19 +- .../indels/HaplotypeIndelErrorModel.java | 68 ++- .../walkers/SimpleIndelGenotyperWalker.java | 419 ------------------ .../AlleleFrequencyCalculationModel.java | 5 +- ...elGenotypeLikelihoodsCalculationModel.java | 85 +++- .../genotyper/DiploidIndelGenotypePriors.java | 125 ++++++ .../genotyper/ExactAFCalculationModel.java | 154 +++++-- .../genotyper/UnifiedGenotyperEngine.java | 23 +- 11 files changed, 484 insertions(+), 491 deletions(-) delete mode 100755 java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimpleIndelGenotyperWalker.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 0798e751d..d13696aa1 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -1,17 +1,23 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.VariantContext; -import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.vcf.VCFCompoundHeaderLine; import org.broad.tribble.vcf.VCFFormatHeaderLine; import org.broad.tribble.vcf.VCFHeaderLineType; -import org.broad.tribble.vcf.VCFCompoundHeaderLine; -import org.broadinstitute.sting.gatk.contexts.*; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; -import org.broadinstitute.sting.utils.pileup.*; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import java.util.*; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; public class DepthPerAlleleBySample implements GenotypeAnnotation, StandardAnnotation { @@ -45,18 +51,19 @@ public class DepthPerAlleleBySample implements GenotypeAnnotation, StandardAnnot counts[0] = alleleCounts.get(vc.getReference().getBases()[0]); for (int i = 0; i < vc.getAlternateAlleles().size(); i++) counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]); - + Map map = new HashMap(); map.put(getKeyNames().get(0), counts); return map; } private Map annotateIndel(StratifiedAlignmentContext stratifiedContext, VariantContext vc) { - ReadBackedExtendedEventPileup pileup = stratifiedContext.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getExtendedEventPileup(); - if ( pileup == null ) +// ReadBackedExtendedEventPileup pileup = stratifiedContext.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getExtendedEventPileup(); + //ReadBackedPileup pileup = stratifiedContext.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(); + //if ( pileup == null ) return null; - Integer[] counts = new Integer[2]; + //Integer[] counts = new Integer[2]; // TODO -- fix me /* @@ -69,9 +76,9 @@ public class DepthPerAlleleBySample implements GenotypeAnnotation, StandardAnnot } */ - Map map = new HashMap(); - map.put(getKeyNames().get(0), counts); - return map; + //Map map = new HashMap(); + //map.put(getKeyNames().get(0), counts); + //return map; } public List getKeyNames() { return Arrays.asList("AD"); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java index 0441ec819..7382a5bbd 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java @@ -1,21 +1,22 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFConstants; import org.broad.tribble.vcf.VCFHeaderLineType; import org.broad.tribble.vcf.VCFInfoHeaderLine; -import org.broad.tribble.vcf.VCFConstants; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import java.util.Map; +import java.util.Arrays; import java.util.HashMap; import java.util.List; -import java.util.Arrays; +import java.util.Map; public class MappingQualityZero implements InfoFieldAnnotation, StandardAnnotation { @@ -26,10 +27,18 @@ public class MappingQualityZero implements InfoFieldAnnotation, StandardAnnotati int mq0 = 0; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - ReadBackedPileup pileup = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(); - for (PileupElement p : pileup ) { - if ( p.getMappingQual() == 0 ) - mq0++; + AlignmentContext context = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE); + ReadBackedPileup pileup = null; + if (context.hasExtendedEventPileup()) + pileup = context.getExtendedEventPileup(); + else if (context.hasBasePileup()) + pileup = context.getBasePileup(); + + if (pileup != null) { + for (PileupElement p : pileup ) { + if ( p.getMappingQual() == 0 ) + mq0++; + } } } Map map = new HashMap(); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java index 0f03a2576..22e52dbd8 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -1,17 +1,18 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFConstants; import org.broad.tribble.vcf.VCFHeaderLineType; import org.broad.tribble.vcf.VCFInfoHeaderLine; -import org.broad.tribble.vcf.VCFConstants; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import java.util.*; @@ -24,9 +25,17 @@ public class RMSMappingQuality implements InfoFieldAnnotation, StandardAnnotatio ArrayList qualities = new ArrayList(); for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - ReadBackedPileup pileup = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(); - for (PileupElement p : pileup ) - qualities.add(p.getRead().getMappingQuality()); + AlignmentContext context = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE); + ReadBackedPileup pileup = null; + if (context.hasExtendedEventPileup()) + pileup = context.getExtendedEventPileup(); + else if (context.hasBasePileup()) + pileup = context.getBasePileup(); + + if (pileup != null) { + for (PileupElement p : pileup ) + qualities.add(p.getRead().getMappingQuality()); + } } int[] quals = new int[qualities.size()]; int index = 0; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java index 8351d718e..387e32188 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.VCFHeaderLineType; import org.broad.tribble.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -10,10 +11,10 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnot import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import java.util.Map; +import java.util.Arrays; import java.util.HashMap; import java.util.List; -import java.util.Arrays; +import java.util.Map; public class SpanningDeletions implements InfoFieldAnnotation, StandardAnnotation { @@ -25,9 +26,17 @@ public class SpanningDeletions implements InfoFieldAnnotation, StandardAnnotatio int deletions = 0; int depth = 0; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - ReadBackedPileup pileup = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(); - deletions += pileup.getNumberOfDeletions(); - depth += pileup.size(); + AlignmentContext context = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE); + ReadBackedPileup pileup = null; + if (context.hasExtendedEventPileup()) + pileup = context.getExtendedEventPileup(); + else if (context.hasBasePileup()) + pileup = context.getBasePileup(); + + if (pileup != null) { + deletions += pileup.getNumberOfDeletions(); + depth += pileup.size(); + } } Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%.2f", depth == 0 ? 0.0 : (double)deletions/(double)depth)); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java index e505b556a..1af6a3f89 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.genotype.Haplotype; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import java.util.Arrays; import java.util.List; @@ -203,7 +204,7 @@ public class HaplotypeIndelErrorModel { } if (read.getReadName().contains("106880")) { - + System.out.println("aca"); System.out.println("Haplotype:"); @@ -305,10 +306,10 @@ public class HaplotypeIndelErrorModel { // workaround for reads whose bases quality = 0, if (readQual < 1) readQual = 1; - + double baseProb = QualityUtils.qualToProb(readQual); - + double pBaseMatch = probToQual(baseProb); double pBaseMismatch = (double)(readQual); @@ -372,5 +373,66 @@ public class HaplotypeIndelErrorModel { } + public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List haplotypesInVC, + VariantContext vc, int eventLength){ + double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()]; + double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()]; + int i=0; + for (SAMRecord read : pileup.getReads()) { + + // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi)) + // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent + for (int j=0; j < haplotypesInVC.size(); j++) { + readLikelihoods[i][j]= computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read, vc, eventLength); + if (DEBUG) { + System.out.print(read.getReadName()+" "); + + System.out.format("%d %d S:%d US:%d E:%d UE:%d C:%s %3.4f\n",i, j, read.getAlignmentStart(), + read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(), + read.getCigarString(), readLikelihoods[i][j]); + } + + } + i++; + } + + for (i=0; i < haplotypesInVC.size(); i++) { + for (int j=i; j < haplotypesInVC.size(); j++){ + // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] + // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2) + //readLikelihoods[k][j] has log10(Pr(R_k) | H[j] ) + double[] readLikelihood = new double[2]; // diploid sample + for (int readIdx=0; readIdx < pileup.getReads().size(); readIdx++) { + readLikelihood[0] = -readLikelihoods[readIdx][i]/10; + readLikelihood[1] = -readLikelihoods[readIdx][j]/10; + + double probRGivenHPair = MathUtils.sumLog10(readLikelihood)/2; + haplotypeLikehoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair); + + } + + + } + } + + return haplotypeLikehoodMatrix; + + } + + public static double[] getPosteriorProbabilitesFromHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) { + int hSize = haplotypeLikehoodMatrix.length; + double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2]; + + int k=0; + for (int i=0; i < hSize; i++) { + for (int j=i; j < hSize; j++){ + genotypeLikelihoods[k++] = -haplotypeLikehoodMatrix[i][j]/10; + } + } + // normalize likelihoods and pass to linear domain. + return MathUtils.normalizeFromLog10(genotypeLikelihoods); + + } + } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimpleIndelGenotyperWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimpleIndelGenotyperWalker.java deleted file mode 100755 index f6bd5de79..000000000 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/SimpleIndelGenotyperWalker.java +++ /dev/null @@ -1,419 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.oneoffprojects.walkers; - -import net.sf.samtools.SAMRecord; -import org.broad.tribble.util.variantcontext.Genotype; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.collections.CircularArray; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.genotype.Haplotype; -import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.io.PrintStream; -import java.util.*; - -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.Window; - - -@Reference(window=@Window(start=-30,stop=200)) -@Allows({DataSource.READS, DataSource.REFERENCE}) -public class SimpleIndelGenotyperWalker extends RefWalker { - @Output - PrintStream out; - @Argument(fullName="maxReadDeletionLength",shortName="maxReadDeletionLength",doc="Max deletion length allowed when aligning reads to candidate haplotypes.",required=false) - int maxReadDeletionLength = 3; - - @Argument(fullName="insertionStartProbability",shortName="insertionStartProbability",doc="Assess only sites with coverage at or below the specified value.",required=false) - double insertionStartProbability = 0.001; - - @Argument(fullName="insertionEndProbability",shortName="insertionEndProbability",doc="Assess only sites with coverage at or below the specified value.",required=false) - double insertionEndProbability = 0.5; - - @Argument(fullName="alphaDeletionProbability",shortName="alphaDeletionProbability",doc="Assess only sites with coverage at or below the specified value.",required=false) - double alphaDeletionProbability = 0.001; - - - @Argument(fullName="haplotypeSize",shortName="hsize",doc="Size of haplotypes to evaluate calls.",required=false) - int HAPLOTYPE_SIZE = 80; - - - @Argument(fullName="indelHeterozygozity",shortName="indhet",doc="Indel Heterozygosity (assumed 1/6500 from empirical human data)",required=false) - double indelHeterozygosity = (double)1.0/6500.0; - - @Argument(fullName="doSimpleCalculationModel",shortName="simple",doc="Use Simple Calculation Model for Pr(Reads | Haplotype)",required=false) - boolean doSimple = false; - - @Argument(fullName="enableDebugOutput",shortName="debugout",doc="Output debug data",required=false) - boolean DEBUG = false; - - @Argument(fullName="doTrio",shortName="trio",doc="Output 1KG CEU trio genotype data (sampleName ignored)",required=false) - boolean doTrio = false; - - @Argument(fullName="useFlatPriors",shortName="flat",doc="If present, use flat priors on haplotypes",required=false) - boolean useFlatPriors = false; - - @Argument(fullName="useDynamicHaplotypeSize",shortName="dhsize",doc="If present, use dynamic haplotype size",required=false) - boolean useDynamicHaplotypeSize = false; - - - @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false) - public double STANDARD_CONFIDENCE_FOR_CALLING = 10.0; - - - @Override - public boolean generateExtendedEvents() { return true; } - - @Override - public boolean includeReadsWithDeletionAtLoci() { return true; } - - - private HaplotypeIndelErrorModel model; - - - Set sampleNames; - - - @Override - public void initialize() { - model = new HaplotypeIndelErrorModel(maxReadDeletionLength, insertionStartProbability, - insertionEndProbability, alphaDeletionProbability, HAPLOTYPE_SIZE, doSimple, DEBUG); - - sampleNames = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - - } - - - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - if (!context.hasBasePileup()) - return 0; - - - VariantContext vc = tracker.getVariantContext(ref, "indels", null, context.getLocation(), true); - // ignore places where we don't have a variant - if ( vc == null ) - return 0; - - - if (!vc.isIndel()) - return 0; - - int eventLength = vc.getReference().getBaseString().length() - vc.getAlternateAllele(0).getBaseString().length(); // assume only one alt allele for now - if (eventLength<0) - eventLength = - eventLength; - - int currentHaplotypeSize = HAPLOTYPE_SIZE; - List haplotypesInVC = new ArrayList(); - - int minHaplotypeSize = Haplotype.LEFT_WINDOW_SIZE + eventLength + 2; // to be safe - - - if (useDynamicHaplotypeSize) { - if (currentHaplotypeSize < minHaplotypeSize) - currentHaplotypeSize = minHaplotypeSize; - - boolean doneHaplotype = false; - while (!doneHaplotype) { - - haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, currentHaplotypeSize); - String[] haplotypeStrings = new String[haplotypesInVC.size()]; - for (int k = 0; k < haplotypeStrings.length; k++) - haplotypeStrings[k] = new String(haplotypesInVC.get(k).getBasesAsBytes()); - - // check if all strings are the same. - for (int k = 1; k < haplotypeStrings.length; k++) { - if (!haplotypeStrings[k].equals(haplotypeStrings[0])) - doneHaplotype = true; - - } - - if (doneHaplotype) - break; - else - currentHaplotypeSize = 2*currentHaplotypeSize; - - } - - } - else { - if (currentHaplotypeSize < minHaplotypeSize) - throw new UserException.BadArgumentValue("Insufficient haplotype length to represent all indels:", - String.valueOf(currentHaplotypeSize)); - - haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, currentHaplotypeSize); - } - - // combine likelihoods for all possible haplotype pair (n*(n-1)/2 combinations) - - double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()]; - double bestLikelihood = 1e13; - // for given allele haplotype, compute likelihood of read pileup given haplotype - ReadBackedPileup pileup = context.getBasePileup().getPileupWithoutMappingQualityZeroReads(); - - if (pileup.getSamples().size() > 1) { - throw new UserException.BadInput("Currently only a single-sample BAM is supported for Indel genotyper"); - } - - - // compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them. - // In general, we'll assume: even spread of indels throughout genome (not true, but simplifying assumption), - // and memoryless spread (i.e. probability that an indel lies in an interval A is independent of probability of - // another indel lying in interval B iff A and B don't overlap), then we can approximate inter-indel distances - // by an exponential distribution of mean 1/theta (theta = heterozygozity), and the number of indels on an interval - // of size L is Poisson-distributed with parameter lambda = theta*L. - - // Since typically, for small haplotype sizes and human heterozygozity, lambda will be <<1, we'll further approximate it - // by assuming that only one indel can happen in a particular interval, with Pr(indel present) = lambda*exp(-lambda), and - // pr(no indel) = 1-lambda*exp(-lambda) ~= exp(-lambda) for small lambda. - - // We also assume that a deletion is equally likely as an insertion (empirical observation, see e.g. Mills et al, Genome Research 2006) - // and we assume the following frequency spectrum for indel sizes Pr(event Length = L)= K*abs(L)^(-1.89)*10^(-0.015*abs(L)), - // taking positive L = insertions, negative L = deletions. K turns out to be about 1.5716 for probabilities to sum to one. - // so -10*log10(Pr event Length = L) =-10*log10(K)+ 18.9*log10(abs(L)) + 0.15*abs(L). - // Hence, Pr(observe event size = L in interval) ~ Pr(observe event L | event present) Pr (event present in interval) - // and -10*log10(above) = -10*log10(K)+ 18.9*log10(abs(L)) + 0.15*abs(L) - 10*log10(theta*L), and we ignore terms that would be - // added to ref hypothesis. - // Equation above is prior model. - - if (!useFlatPriors) { - - double lambda = (double)HAPLOTYPE_SIZE * indelHeterozygosity; - double altPrior = HaplotypeIndelErrorModel.probToQual(lambda)-HaplotypeIndelErrorModel.probToQual(eventLength)*1.89 + 0.15*eventLength - + HaplotypeIndelErrorModel.probToQual(1.5716)+ HaplotypeIndelErrorModel.probToQual(0.5); - - haplotypeLikehoodMatrix[0][1] = altPrior; - haplotypeLikehoodMatrix[1][1] = 2*altPrior; - } - - int bestIndexI =-1, bestIndexJ=-1; - double callConfidence = bestLikelihood; - - - if (pileup.getReads().size() > 0) { - double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()]; - int i=0; - for (SAMRecord read : pileup.getReads()) { - - // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi)) - // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent - for (int j=0; j < haplotypesInVC.size(); j++) { - readLikelihoods[i][j]= model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read, vc, eventLength); - if (DEBUG) { - System.out.print(read.getReadName()+" "); - - System.out.format("%d %d S:%d US:%d E:%d UE:%d C:%s %3.4f\n",i, j, read.getAlignmentStart(), - read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(), - read.getCigarString(), readLikelihoods[i][j]); - } - - } - i++; - } - - for (i=0; i < haplotypesInVC.size(); i++) { - for (int j=i; j < haplotypesInVC.size(); j++){ - // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] - // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2) - //readLikelihoods[k][j] has log10(Pr(R_k) | H[j] ) - double[] readLikelihood = new double[2]; // diploid sample - for (int readIdx=0; readIdx < pileup.getReads().size(); readIdx++) { - readLikelihood[0] = -readLikelihoods[readIdx][i]/10; - readLikelihood[1] = -readLikelihoods[readIdx][j]/10; - - double probRGivenHPair = MathUtils.sumLog10(readLikelihood)/2; - haplotypeLikehoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair); - - } - - - if (haplotypeLikehoodMatrix[i][j] < bestLikelihood) { - bestIndexI = i; - bestIndexJ = j; - - callConfidence = bestLikelihood - haplotypeLikehoodMatrix[i][j]; - bestLikelihood = haplotypeLikehoodMatrix[i][j]; - } - } - } - } - - //we say that most likely genotype at site is index (i,j) maximizing likelihood matrix - String type; - if (vc.isDeletion()) - type = "DEL"; - else if (vc.isInsertion()) - type = "INS"; - else - type = "OTH"; - - type += vc.getIndelLengths().toString(); - - - out.format("%s %d %s ",vc.getChr(), vc.getStart(), type); - - if (doTrio) { - Genotype originalGenotype = vc.getGenotype("NA12878"); - Genotype dadGenotype = vc.getGenotype("NA12891"); - Genotype momGenotype = vc.getGenotype("NA12892"); - - String dadG, momG, oldG, newG; - oldG = getGenotypeString(originalGenotype); - dadG = getGenotypeString(dadGenotype); - momG = getGenotypeString(momGenotype); - - int x = bestIndexI+bestIndexJ; - if (x == 0) - newG = "HOMREF"; - else if (x == 1) - newG = "HET"; - else if (x == 2) - newG = "HOMVAR"; - else if (x < 0) - newG = "NOCALL"; - else - newG = "OTHER"; - - - if (callConfidence < STANDARD_CONFIDENCE_FOR_CALLING) { - newG = "NOCALL"; - } - out.format("NewG %s OldG %s DadG %s MomG %s\n", newG, oldG, dadG, momG); - } - else { - - for (String sample: sampleNames) { - String oldG, newG; - Genotype originalGenotype = vc.getGenotype(sample); - oldG = getGenotypeString(originalGenotype); - int x = bestIndexI+bestIndexJ; - if (x == 0) - newG = "HOMREF"; - else if (x == 1) - newG = "HET"; - else if (x == 2) - newG = "HOMVAR"; - else if (x < 0) - newG = "NOCALL"; - else - newG = "OTHER"; - - if (callConfidence < STANDARD_CONFIDENCE_FOR_CALLING) { - newG = "NOCALL"; - } - - out.format("NewG %s OldG %s\n", newG, oldG); - } - } - -/* - if ( context.hasExtendedEventPileup() ) { - - // if we got indels at current position: - - ReadBackedExtendedEventPileup pileup = context.getExtendedEventPileup().getPileupWithoutMappingQualityZeroReads(); - if ( pileup.size() < MIN_COVERAGE ) return 0; - - if ( pileup.getNumberOfDeletions() + pileup.getNumberOfInsertions() > MAX_INDELS ) { - // we got too many indel events. Maybe it's even a true event, and what we are looking for are - // errors rather than true calls. Hence, we do not need these indels. We have to 1) discard - // all remaining indels from the buffer: if they are still in the buffer, they are too close - // to the current position; and 2) make sure that the next position at which we attempt to count again is - // sufficiently far *after* the current position. - // System.out.println("Non countable indel event at "+pileup.getLocation()); - countableIndelBuffer.clear(); - coverageBuffer.clear(); // we do not want to count observations (read bases) around non-countable indel as well - skipToLoc = GenomeLocParser.createGenomeLoc(pileup.getLocation().getContigIndex(),pileup.getLocation().getStop()+pileup.getMaxDeletionLength()+MIN_DISTANCE+1); - // System.out.println("Skip to "+skipToLoc); - } else { - // pileup does not contain too many indels, we need to store them in the buffer and count them later, - // if a non-countable indel event(s) do not show up too soon: - countableIndelBuffer.add(pileup); - } - return 0; - } - */ - - - return 1; //To change body of implemented methods use File | Settings | File Templates. - } - - private String getGenotypeString(Genotype genotype) { - String oldG; - if (genotype.isHomRef()) - oldG = "HOMREF"; - else if (genotype.isHet()) - oldG = "HET"; - else if (genotype.isHomVar()) - oldG = "HOMVAR"; - else - oldG = "OTHER"; - - return oldG; - } - /** - * Provide an initial value for reduce computations. - * - * @return Initial value of reduce. - */ - public Integer reduceInit() { - return 0; //To change body of implemented methods use File | Settings | File Templates. - } - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. - */ - public Integer reduce(Integer value, Integer sum) { - return value+sum; //To change body of implemented methods use File | Settings | File Templates. - } - - @Override - public void onTraversalDone(Integer result) { - } - -} - diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index dbdb60b6d..d63dc5947 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -179,7 +179,8 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { AFMatrix.clear(); for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) { - if ( isClearRefCall(GL) ) { + // todo - gdebug workaround for crash + if (false) { //isClearRefCall(GL) ) { refCalls.add(GL); } else { AFMatrix.setLikelihoods(GL.getPosteriors(), GL.getSample()); @@ -206,7 +207,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { } } - private enum GenotypeType { AA, AB, BB } + protected enum GenotypeType { AA, AB, BB } protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE; protected static class AlleleFrequencyMatrix { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java index cb97e6940..5a73a54a2 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java @@ -26,7 +26,12 @@ package org.broadinstitute.sting.playground.gatk.walkers.genotyper; import org.apache.log4j.Logger; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.genotype.Haplotype; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; @@ -37,9 +42,23 @@ import org.broad.tribble.util.variantcontext.Allele; import java.util.*; public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { + private final int maxReadDeletionLength = 3; + private final double insertionStartProbability = 1e-3; + private final double insertionEndProbability = 0.5; + private final double alphaDeletionProbability = 1e-3; + private final int HAPLOTYPE_SIZE = 80; + + // todo - the following need to be exposed for command line argument control + private final double indelHeterozygosity = 1.0/8000; + boolean useFlatPriors = true; + boolean DEBUGOUT = false; + + private HaplotypeIndelErrorModel model; protected DindelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); + model = new HaplotypeIndelErrorModel(maxReadDeletionLength, insertionStartProbability, + insertionEndProbability, alphaDeletionProbability, HAPLOTYPE_SIZE, false, DEBUGOUT); } public Allele getLikelihoods(RefMetaDataTracker tracker, @@ -48,17 +67,71 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo StratifiedAlignmentContext.StratifiedContextType contextType, GenotypePriors priors, Map GLs) { - // TODO: check to make sure the priors instanceof a valid priors class - // TODO: create a single set of Alleles to be passed into each BiallelicGenotypeLikelihoods object to minimize memory consumption + if ( tracker == null ) + return null; + + + VariantContext vc = tracker.getVariantContext(ref, "indels", null, ref.getLocus(), true); + // ignore places where we don't have a variant + if ( vc == null ) + return null; + + + if (!vc.isIndel()) + return null; + + if ( !(priors instanceof DiploidIndelGenotypePriors) ) + throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model"); + + int eventLength = vc.getReference().getBaseString().length() - vc.getAlternateAllele(0).getBaseString().length(); + // assume only one alt allele for now + if (eventLength<0) + eventLength = - eventLength; + + int currentHaplotypeSize = HAPLOTYPE_SIZE; + List haplotypesInVC = new ArrayList(); + int minHaplotypeSize = Haplotype.LEFT_WINDOW_SIZE + eventLength + 2; // to be safe + + // int numSamples = getNSamples(contexts); + haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, currentHaplotypeSize); + // For each sample, get genotype likelihoods based on pileup + // compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them. + // initialize the GenotypeLikelihoods + GLs.clear(); + + double[][] haplotypeLikehoodMatrix; + + if (useFlatPriors) { + priors = new DiploidIndelGenotypePriors(); + } + else + priors = new DiploidIndelGenotypePriors(indelHeterozygosity,eventLength,currentHaplotypeSize); + + //double[] priorLikelihoods = priors.getPriors(); for ( Map.Entry sample : contexts.entrySet() ) { - // TODO: fill me in + AlignmentContext context = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE); - //GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(), refAllele, altAllele, ...)); + ReadBackedPileup pileup = null; + if (context.hasExtendedEventPileup()) + pileup = context.getExtendedEventPileup(); + else if (context.hasBasePileup()) + pileup = context.getBasePileup(); + + if (pileup != null ) { + haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, vc, eventLength); + + + double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getPosteriorProbabilitesFromHaplotypeLikelihoods( haplotypeLikehoodMatrix); + + GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),vc.getReference(), + vc.getAlternateAllele(0), + Math.log10(genotypeLikelihoods[0]),Math.log10(genotypeLikelihoods[1]), Math.log10(genotypeLikelihoods[2]))); + + } } - // TODO: return the reference Allele - return null; + return vc.getReference(); } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java new file mode 100755 index 000000000..5a2b7985d --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java @@ -0,0 +1,125 @@ +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.genotype.DiploidGenotype; + +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: Sep 30, 2010 + * Time: 1:47:55 PM + * To change this template use File | Settings | File Templates. + */ +public class DiploidIndelGenotypePriors implements GenotypePriors { + // -------------------------------------------------------------------------------------------------------------- + // + // Constants and static information + // + // -------------------------------------------------------------------------------------------------------------- + public static final double INDEL_HETEROZYGOSITY = 1e-4; + + private final static double[] flatPriors = new double[DiploidGenotype.values().length]; + + // -------------------------------------------------------------------------------------------------------------- + // + // Diploid priors + // + // -------------------------------------------------------------------------------------------------------------- + private double[] priors = null; + + /** + * Create a new DiploidGenotypePriors object with flat priors for each diploid genotype + */ + public DiploidIndelGenotypePriors() { + priors = flatPriors.clone(); + } + + public DiploidIndelGenotypePriors(double indelHeterozygosity, int eventLength, int haplotypeSize) { + double varPrior = getHaplotypePriors(indelHeterozygosity, eventLength, haplotypeSize); + priors[2] = Math.log10(varPrior*varPrior); + priors[1] = Math.log10(2*varPrior*(1-varPrior)); + priors[0] = Math.log10((1-varPrior)*(1-varPrior)); + + } + + + /** + * Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values(). + * + * @return log10 prior as a double array + */ + public double[] getPriors() { + return priors; + } + + /** + * Returns the prior associated with DiploidGenotype g + * @param g + * @return log10 prior as a double + */ + public double getPrior(DiploidGenotype g) { + return getPriors()[g.ordinal()]; + } + + public double getHeterozygosity() { return INDEL_HETEROZYGOSITY; } + + public boolean validate(boolean throwException) { + try { + + for ( DiploidGenotype g : DiploidGenotype.values() ) { + int i = g.ordinal(); + if ( ! MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) { + String bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i])); + throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad)); + } + } + } catch ( IllegalStateException e ) { + if ( throwException ) + throw new RuntimeException(e); + else + return false; + } + + return true; + } + + public double getHaplotypePriors(double indelHeterozygosity, int eventLength, int haplotypeSize) { + // compute prior likelihoods on haplotypes. + // In general, we'll assume: even spread of indels throughout genome (not true, but simplifying assumption), + // and memoryless spread (i.e. probability that an indel lies in an interval A is independent of probability of + // another indel lying in interval B iff A and B don't overlap), then we can approximate inter-indel distances + // by an exponential distribution of mean 1/theta (theta = heterozygozity), and the number of indels on an interval + // of size L is Poisson-distributed with parameter lambda = theta*L. + + // Since typically, for small haplotype sizes and human heterozygozity, lambda will be <<1, we'll further approximate it + // by assuming that only one indel can happen in a particular interval, with Pr(indel present) = lambda*exp(-lambda), and + // pr(no indel) = 1-lambda*exp(-lambda) ~= exp(-lambda) for small lambda. + + // We also assume that a deletion is equally likely as an insertion (empirical observation, see e.g. Mills et al, Genome Research 2006) + // and we assume the following frequency spectrum for indel sizes Pr(event Length = L)= K*abs(L)^(-1.89)*10^(-0.015*abs(L)), + // taking positive L = insertions, negative L = deletions. K turns out to be about 1.5716 for probabilities to sum to one. + // so -10*log10(Pr event Length = L) =-10*log10(K)+ 18.9*log10(abs(L)) + 0.15*abs(L). + // Hence, Pr(observe event size = L in interval) ~ Pr(observe event L | event present) Pr (event present in interval) + // and -10*log10(above) = -10*log10(K)+ 18.9*log10(abs(L)) + 0.15*abs(L) - 10*log10(theta*L), and we ignore terms that would be + // added to ref hypothesis. + // Equation above is prior model. + + double lambda = (double)haplotypeSize * indelHeterozygosity; + return HaplotypeIndelErrorModel.probToQual(lambda)-HaplotypeIndelErrorModel.probToQual(eventLength)*1.89 + 0.15*eventLength + + HaplotypeIndelErrorModel.probToQual(1.5716)+ HaplotypeIndelErrorModel.probToQual(0.5); + + + + } + + + static { + for ( DiploidGenotype g : DiploidGenotype.values() ) { + flatPriors[g.ordinal()] = Math.log10(1.0 / DiploidGenotype.values().length); + } + } +} + diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java index a347ab8a7..9122626bd 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -26,6 +26,12 @@ package org.broadinstitute.sting.playground.gatk.walkers.genotyper; import org.apache.log4j.Logger; +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.GenotypeLikelihoods; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFConstants; +import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -35,10 +41,9 @@ import java.io.PrintStream; public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { - private int maxNumIterations = 50; - private double tol = 1e-4; - private boolean DEBUGOUT = true; - + private boolean DEBUGOUT = false; + private boolean SIMPLE_GREEDY_GENOTYPER = false; + protected ExactAFCalculationModel(int N, Logger logger, PrintStream verboseWriter) { super(N, logger, verboseWriter); } @@ -50,30 +55,21 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { double[] log10AlleleFrequencyPosteriors) { // Math requires linear math to make efficient updates. - double[] alleleFrequencyPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPriors); + double[] alleleFrequencyPriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPriors); + double[] alleleFrequencyPosteriors; + - // now that we have genotype likelihoods for each sample, we can start refining allele frequency estimate + alleleFrequencyPosteriors = updateAFEstimate(GLs, alleleFrequencyPriors); double meanAF = computeMeanAF(alleleFrequencyPosteriors); + if (DEBUGOUT) - System.out.format("Initial Mean AF: %5.6f\n", meanAF); + System.out.format("Mean AF: %5.4f. PVariant: %5.5f\n", meanAF,1.0-alleleFrequencyPosteriors[0]); - for (int numIterations=0; numIterations < maxNumIterations; numIterations++) { - double oldMeanAF = meanAF; - alleleFrequencyPosteriors = updateAFEstimate(GLs, alleleFrequencyPosteriors); - meanAF = computeMeanAF(alleleFrequencyPosteriors); - if (DEBUGOUT) - System.out.format("Mean AF: %5.4f. PVariant: %5.5f\n", meanAF,1.0-alleleFrequencyPosteriors[0]); - - if (Math.abs(meanAF-oldMeanAF) < tol) - break; - - } - - for (int k=0; k < alleleFrequencyPosteriors.length; k++) + for (int k=0; k < alleleFrequencyPosteriors.length; k++) { log10AlleleFrequencyPosteriors[k] = Math.log10(alleleFrequencyPosteriors[k]); - + } } private double[] updateAFEstimate(Map GLs, double[] alleleFrequencyPriors) { @@ -93,11 +89,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { double den = 2.0*j*(2.0*j-1); for (int k=0; k <= 2*j; k++ ) { - double tmp = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[2]; + double tmp = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()]; if (k > 0) - tmp += (2.0*k)*(2.0*j-k)*YMatrix[j-1][k-1]*genotypeLikelihoods[1]; + tmp += (2.0*k)*(2.0*j-k)*YMatrix[j-1][k-1]*genotypeLikelihoods[GenotypeType.AB.ordinal()]; if (k > 1) - tmp += k*(k-1)*YMatrix[j-1][k-2]*genotypeLikelihoods[0]; + tmp += k*(k-1)*YMatrix[j-1][k-2]*genotypeLikelihoods[GenotypeType.BB.ordinal()]; YMatrix[j][k] = tmp/den; @@ -112,7 +108,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { double[] newAF = new double[alleleFrequencyPriors.length]; for (int k=0; k <= numChr; k++) { - double prod = YMatrix[j][numChr-k] * alleleFrequencyPriors[k]; + double prod = YMatrix[j][k] * alleleFrequencyPriors[k]; newAF[k] = prod; sum += prod; } @@ -135,4 +131,112 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } + protected Map generateCalls(Map contexts, + Map GLs, + int bestAFguess) { + // first experiment: refine do genotype assignment by Hardy Weinberg equilibrium assumption. + HashMap calls = new HashMap(); + + + double[][] pathMetricArray = new double[GLs.size()+1][bestAFguess+1]; + int[][] tracebackArray = new int[GLs.size()+1][bestAFguess+1]; + + ArrayList sampleIndices = new ArrayList(); + int sampleIdx = 0; + + if (SIMPLE_GREEDY_GENOTYPER) { + sampleIndices.addAll(GLs.keySet()); + sampleIdx = GLs.size(); + } + else { + + for (String sample: GLs.keySet()) { + sampleIndices.add(sample); + + double[] likelihoods = GLs.get(sample).getLikelihoods(); + + for (int k=0; k <= bestAFguess; k++) { + + double bestMetric = pathMetricArray[sampleIdx][k] + likelihoods[0]; + int bestIndex = k; + + if (k>0) { + double m2 = pathMetricArray[sampleIdx][k-1] + likelihoods[1]; + if (m2 > bestMetric) { + bestMetric = m2; + bestIndex = k-1; + } + } + + if (k>1) { + double m2 = pathMetricArray[sampleIdx][k-2] + likelihoods[2]; + if (m2 > bestMetric) { + bestMetric = m2; + bestIndex = k-2; + } + } + + pathMetricArray[sampleIdx+1][k] = bestMetric; + tracebackArray[sampleIdx+1][k] = bestIndex; + } + sampleIdx++; + } + } + + int startIdx = bestAFguess; + for (int k = sampleIdx; k > 0; k--) { + int bestGTguess; + String sample = sampleIndices.get(k-1); + BiallelicGenotypeLikelihoods GL = GLs.get(sample); + Allele alleleA = GL.getAlleleA(); + Allele alleleB = GL.getAlleleB(); + + if (SIMPLE_GREEDY_GENOTYPER) + bestGTguess = Utils.findIndexOfMaxEntry(GLs.get(sample).getLikelihoods()); + else { + int newIdx = tracebackArray[k][startIdx]; + bestGTguess = startIdx - newIdx; + startIdx = newIdx; + } + + HashMap attributes = new HashMap(); + ArrayList myAlleles = new ArrayList(); + attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size()); + double qual = 0.0; + double[] posteriors = GLs.get(sample).getPosteriors(); + + if (bestGTguess == 0) { + myAlleles.add(alleleA); + myAlleles.add(alleleA); + //qual = posteriors[0] - Math.max(posteriors[1],posteriors[2]); + } else if(bestGTguess == 1) { + myAlleles.add(alleleA); + myAlleles.add(alleleB); + //qual = posteriors[1] - Math.max(posteriors[0],posteriors[2]); + + } else { + myAlleles.add(alleleB); + myAlleles.add(alleleB); + // qual = posteriors[2] - Math.max(posteriors[1],posteriors[0]); + } +/* + if (qual <= 0.0) { + qual = 0.0; + myAlleles.clear(); + myAlleles.add(Allele.NO_CALL); +// myAlleles.add(Allele.NO_CALL); + } +*/ + attributes.put(VCFConstants.GENOTYPE_QUALITY_KEY,String.format("%4.2f", 10*qual)); + + GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods()); + attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString()); + calls.put(sample, new Genotype(sample, myAlleles, qual, null, attributes, false)); + + } + + + return calls; + } + } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index f740a6433..c32fcd371 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -164,7 +164,9 @@ public class UnifiedGenotyperEngine { if ( bestAFguess != 0 ) { phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); if ( Double.isInfinite(phredScaledConfidence) ) - phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0]; + // todo - verify this is OK + phredScaledConfidence = (double)QualityUtils.MAX_QUAL_SCORE; + //phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0]; } else { phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); if ( Double.isInfinite(phredScaledConfidence) ) { @@ -224,11 +226,22 @@ public class UnifiedGenotyperEngine { } GenomeLoc loc = refContext.getLocus(); - VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes); + + // todo - temp fix until we can deal with extended events properly + //VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes); + VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(), + loc.getStart()+refAllele.length(), alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes); + if ( annotationEngine != null ) { // first off, we want to use the *unfiltered* context for the annotations - stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE); + ReadBackedPileup pileup = null; + if (rawContext.hasExtendedEventPileup()) + pileup = rawContext.getExtendedEventPileup(); + else if (rawContext.hasBasePileup()) + pileup = rawContext.getBasePileup(); + + stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UAC.ASSUME_SINGLE_SAMPLE); Collection variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vc); vc = variantContexts.iterator().next(); //We know the collection will always have exactly 1 element. @@ -413,8 +426,8 @@ public class UnifiedGenotyperEngine { priors = new DiploidSNPGenotypePriors(); break; case DINDEL: - // TODO: create indel priors object - priors = null; + // create flat priors for Indels, actual priors will depend on event length to be genotyped + priors = new DiploidIndelGenotypePriors(); break; default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel); }