Next iteration on new UG framework:
- Brought over exact AF estimation from branch (which is now dead). Exact model is default in UnifiedGenotyperV2. - Implemented completely new genotyping algorithm given best AF estimate using dynamic programming, which in theory should be better than both greedy search and any HWE-based genotyper. - Integrated and added new Dindel likelihood estimation model. - Corrected annotators that would call readBasePileup: since we can be annotating extended events, best way is to interrogate context for kind of pileup and either readBasePileup or readExtendedEventPileup. All changes above except last one are still in playground since they require more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4396 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
027241e1ca
commit
ece694d0af
|
|
@ -1,17 +1,23 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broad.tribble.vcf.VCFCompoundHeaderLine;
|
||||
import org.broad.tribble.vcf.VCFFormatHeaderLine;
|
||||
import org.broad.tribble.vcf.VCFHeaderLineType;
|
||||
import org.broad.tribble.vcf.VCFCompoundHeaderLine;
|
||||
import org.broadinstitute.sting.gatk.contexts.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*;
|
||||
import org.broadinstitute.sting.utils.pileup.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
public class DepthPerAlleleBySample implements GenotypeAnnotation, StandardAnnotation {
|
||||
|
|
@ -45,18 +51,19 @@ public class DepthPerAlleleBySample implements GenotypeAnnotation, StandardAnnot
|
|||
counts[0] = alleleCounts.get(vc.getReference().getBases()[0]);
|
||||
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
|
||||
counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]);
|
||||
|
||||
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), counts);
|
||||
return map;
|
||||
}
|
||||
|
||||
private Map<String,Object> annotateIndel(StratifiedAlignmentContext stratifiedContext, VariantContext vc) {
|
||||
ReadBackedExtendedEventPileup pileup = stratifiedContext.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getExtendedEventPileup();
|
||||
if ( pileup == null )
|
||||
// ReadBackedExtendedEventPileup pileup = stratifiedContext.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getExtendedEventPileup();
|
||||
//ReadBackedPileup pileup = stratifiedContext.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup();
|
||||
//if ( pileup == null )
|
||||
return null;
|
||||
|
||||
Integer[] counts = new Integer[2];
|
||||
//Integer[] counts = new Integer[2];
|
||||
|
||||
// TODO -- fix me
|
||||
/*
|
||||
|
|
@ -69,9 +76,9 @@ public class DepthPerAlleleBySample implements GenotypeAnnotation, StandardAnnot
|
|||
}
|
||||
*/
|
||||
|
||||
Map<String,Object> map = new HashMap<String,Object>();
|
||||
map.put(getKeyNames().get(0), counts);
|
||||
return map;
|
||||
//Map<String,Object> map = new HashMap<String,Object>();
|
||||
//map.put(getKeyNames().get(0), counts);
|
||||
//return map;
|
||||
}
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList("AD"); }
|
||||
|
|
|
|||
|
|
@ -1,21 +1,22 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.VCFConstants;
|
||||
import org.broad.tribble.vcf.VCFHeaderLineType;
|
||||
import org.broad.tribble.vcf.VCFInfoHeaderLine;
|
||||
import org.broad.tribble.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
public class MappingQualityZero implements InfoFieldAnnotation, StandardAnnotation {
|
||||
|
|
@ -26,10 +27,18 @@ public class MappingQualityZero implements InfoFieldAnnotation, StandardAnnotati
|
|||
|
||||
int mq0 = 0;
|
||||
for ( Map.Entry<String, StratifiedAlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
ReadBackedPileup pileup = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup();
|
||||
for (PileupElement p : pileup ) {
|
||||
if ( p.getMappingQual() == 0 )
|
||||
mq0++;
|
||||
AlignmentContext context = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
||||
ReadBackedPileup pileup = null;
|
||||
if (context.hasExtendedEventPileup())
|
||||
pileup = context.getExtendedEventPileup();
|
||||
else if (context.hasBasePileup())
|
||||
pileup = context.getBasePileup();
|
||||
|
||||
if (pileup != null) {
|
||||
for (PileupElement p : pileup ) {
|
||||
if ( p.getMappingQual() == 0 )
|
||||
mq0++;
|
||||
}
|
||||
}
|
||||
}
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
|
|
|
|||
|
|
@ -1,17 +1,18 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.VCFConstants;
|
||||
import org.broad.tribble.vcf.VCFHeaderLineType;
|
||||
import org.broad.tribble.vcf.VCFInfoHeaderLine;
|
||||
import org.broad.tribble.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -24,9 +25,17 @@ public class RMSMappingQuality implements InfoFieldAnnotation, StandardAnnotatio
|
|||
|
||||
ArrayList<Integer> qualities = new ArrayList<Integer>();
|
||||
for ( Map.Entry<String, StratifiedAlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
ReadBackedPileup pileup = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup();
|
||||
for (PileupElement p : pileup )
|
||||
qualities.add(p.getRead().getMappingQuality());
|
||||
AlignmentContext context = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
||||
ReadBackedPileup pileup = null;
|
||||
if (context.hasExtendedEventPileup())
|
||||
pileup = context.getExtendedEventPileup();
|
||||
else if (context.hasBasePileup())
|
||||
pileup = context.getBasePileup();
|
||||
|
||||
if (pileup != null) {
|
||||
for (PileupElement p : pileup )
|
||||
qualities.add(p.getRead().getMappingQuality());
|
||||
}
|
||||
}
|
||||
int[] quals = new int[qualities.size()];
|
||||
int index = 0;
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
|
|||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.VCFHeaderLineType;
|
||||
import org.broad.tribble.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -10,10 +11,10 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnot
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
public class SpanningDeletions implements InfoFieldAnnotation, StandardAnnotation {
|
||||
|
|
@ -25,9 +26,17 @@ public class SpanningDeletions implements InfoFieldAnnotation, StandardAnnotatio
|
|||
int deletions = 0;
|
||||
int depth = 0;
|
||||
for ( Map.Entry<String, StratifiedAlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
ReadBackedPileup pileup = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup();
|
||||
deletions += pileup.getNumberOfDeletions();
|
||||
depth += pileup.size();
|
||||
AlignmentContext context = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
||||
ReadBackedPileup pileup = null;
|
||||
if (context.hasExtendedEventPileup())
|
||||
pileup = context.getExtendedEventPileup();
|
||||
else if (context.hasBasePileup())
|
||||
pileup = context.getBasePileup();
|
||||
|
||||
if (pileup != null) {
|
||||
deletions += pileup.getNumberOfDeletions();
|
||||
depth += pileup.size();
|
||||
}
|
||||
}
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), String.format("%.2f", depth == 0 ? 0.0 : (double)deletions/(double)depth));
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ import org.broadinstitute.sting.utils.MathUtils;
|
|||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.genotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
|
@ -203,7 +204,7 @@ public class HaplotypeIndelErrorModel {
|
|||
}
|
||||
|
||||
if (read.getReadName().contains("106880")) {
|
||||
|
||||
|
||||
System.out.println("aca");
|
||||
|
||||
System.out.println("Haplotype:");
|
||||
|
|
@ -305,10 +306,10 @@ public class HaplotypeIndelErrorModel {
|
|||
// workaround for reads whose bases quality = 0,
|
||||
if (readQual < 1)
|
||||
readQual = 1;
|
||||
|
||||
|
||||
double baseProb = QualityUtils.qualToProb(readQual);
|
||||
|
||||
|
||||
|
||||
double pBaseMatch = probToQual(baseProb);
|
||||
double pBaseMismatch = (double)(readQual);
|
||||
|
||||
|
|
@ -372,5 +373,66 @@ public class HaplotypeIndelErrorModel {
|
|||
|
||||
}
|
||||
|
||||
public double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List<Haplotype> haplotypesInVC,
|
||||
VariantContext vc, int eventLength){
|
||||
double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
|
||||
double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()];
|
||||
int i=0;
|
||||
for (SAMRecord read : pileup.getReads()) {
|
||||
|
||||
// for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
|
||||
// = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
|
||||
for (int j=0; j < haplotypesInVC.size(); j++) {
|
||||
readLikelihoods[i][j]= computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read, vc, eventLength);
|
||||
if (DEBUG) {
|
||||
System.out.print(read.getReadName()+" ");
|
||||
|
||||
System.out.format("%d %d S:%d US:%d E:%d UE:%d C:%s %3.4f\n",i, j, read.getAlignmentStart(),
|
||||
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
|
||||
read.getCigarString(), readLikelihoods[i][j]);
|
||||
}
|
||||
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
for (i=0; i < haplotypesInVC.size(); i++) {
|
||||
for (int j=i; j < haplotypesInVC.size(); j++){
|
||||
// combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
|
||||
// L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2)
|
||||
//readLikelihoods[k][j] has log10(Pr(R_k) | H[j] )
|
||||
double[] readLikelihood = new double[2]; // diploid sample
|
||||
for (int readIdx=0; readIdx < pileup.getReads().size(); readIdx++) {
|
||||
readLikelihood[0] = -readLikelihoods[readIdx][i]/10;
|
||||
readLikelihood[1] = -readLikelihoods[readIdx][j]/10;
|
||||
|
||||
double probRGivenHPair = MathUtils.sumLog10(readLikelihood)/2;
|
||||
haplotypeLikehoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair);
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return haplotypeLikehoodMatrix;
|
||||
|
||||
}
|
||||
|
||||
public static double[] getPosteriorProbabilitesFromHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) {
|
||||
int hSize = haplotypeLikehoodMatrix.length;
|
||||
double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2];
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i < hSize; i++) {
|
||||
for (int j=i; j < hSize; j++){
|
||||
genotypeLikelihoods[k++] = -haplotypeLikehoodMatrix[i][j]/10;
|
||||
}
|
||||
}
|
||||
// normalize likelihoods and pass to linear domain.
|
||||
return MathUtils.normalizeFromLog10(genotypeLikelihoods);
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,419 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.collections.CircularArray;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.genotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.Reference;
|
||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||
|
||||
|
||||
@Reference(window=@Window(start=-30,stop=200))
|
||||
@Allows({DataSource.READS, DataSource.REFERENCE})
|
||||
public class SimpleIndelGenotyperWalker extends RefWalker<Integer,Integer> {
|
||||
@Output
|
||||
PrintStream out;
|
||||
@Argument(fullName="maxReadDeletionLength",shortName="maxReadDeletionLength",doc="Max deletion length allowed when aligning reads to candidate haplotypes.",required=false)
|
||||
int maxReadDeletionLength = 3;
|
||||
|
||||
@Argument(fullName="insertionStartProbability",shortName="insertionStartProbability",doc="Assess only sites with coverage at or below the specified value.",required=false)
|
||||
double insertionStartProbability = 0.001;
|
||||
|
||||
@Argument(fullName="insertionEndProbability",shortName="insertionEndProbability",doc="Assess only sites with coverage at or below the specified value.",required=false)
|
||||
double insertionEndProbability = 0.5;
|
||||
|
||||
@Argument(fullName="alphaDeletionProbability",shortName="alphaDeletionProbability",doc="Assess only sites with coverage at or below the specified value.",required=false)
|
||||
double alphaDeletionProbability = 0.001;
|
||||
|
||||
|
||||
@Argument(fullName="haplotypeSize",shortName="hsize",doc="Size of haplotypes to evaluate calls.",required=false)
|
||||
int HAPLOTYPE_SIZE = 80;
|
||||
|
||||
|
||||
@Argument(fullName="indelHeterozygozity",shortName="indhet",doc="Indel Heterozygosity (assumed 1/6500 from empirical human data)",required=false)
|
||||
double indelHeterozygosity = (double)1.0/6500.0;
|
||||
|
||||
@Argument(fullName="doSimpleCalculationModel",shortName="simple",doc="Use Simple Calculation Model for Pr(Reads | Haplotype)",required=false)
|
||||
boolean doSimple = false;
|
||||
|
||||
@Argument(fullName="enableDebugOutput",shortName="debugout",doc="Output debug data",required=false)
|
||||
boolean DEBUG = false;
|
||||
|
||||
@Argument(fullName="doTrio",shortName="trio",doc="Output 1KG CEU trio genotype data (sampleName ignored)",required=false)
|
||||
boolean doTrio = false;
|
||||
|
||||
@Argument(fullName="useFlatPriors",shortName="flat",doc="If present, use flat priors on haplotypes",required=false)
|
||||
boolean useFlatPriors = false;
|
||||
|
||||
@Argument(fullName="useDynamicHaplotypeSize",shortName="dhsize",doc="If present, use dynamic haplotype size",required=false)
|
||||
boolean useDynamicHaplotypeSize = false;
|
||||
|
||||
|
||||
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false)
|
||||
public double STANDARD_CONFIDENCE_FOR_CALLING = 10.0;
|
||||
|
||||
|
||||
@Override
|
||||
public boolean generateExtendedEvents() { return true; }
|
||||
|
||||
@Override
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
|
||||
private HaplotypeIndelErrorModel model;
|
||||
|
||||
|
||||
Set<String> sampleNames;
|
||||
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
model = new HaplotypeIndelErrorModel(maxReadDeletionLength, insertionStartProbability,
|
||||
insertionEndProbability, alphaDeletionProbability, HAPLOTYPE_SIZE, doSimple, DEBUG);
|
||||
|
||||
sampleNames = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null )
|
||||
return 0;
|
||||
|
||||
if (!context.hasBasePileup())
|
||||
return 0;
|
||||
|
||||
|
||||
VariantContext vc = tracker.getVariantContext(ref, "indels", null, context.getLocation(), true);
|
||||
// ignore places where we don't have a variant
|
||||
if ( vc == null )
|
||||
return 0;
|
||||
|
||||
|
||||
if (!vc.isIndel())
|
||||
return 0;
|
||||
|
||||
int eventLength = vc.getReference().getBaseString().length() - vc.getAlternateAllele(0).getBaseString().length(); // assume only one alt allele for now
|
||||
if (eventLength<0)
|
||||
eventLength = - eventLength;
|
||||
|
||||
int currentHaplotypeSize = HAPLOTYPE_SIZE;
|
||||
List<Haplotype> haplotypesInVC = new ArrayList<Haplotype>();
|
||||
|
||||
int minHaplotypeSize = Haplotype.LEFT_WINDOW_SIZE + eventLength + 2; // to be safe
|
||||
|
||||
|
||||
if (useDynamicHaplotypeSize) {
|
||||
if (currentHaplotypeSize < minHaplotypeSize)
|
||||
currentHaplotypeSize = minHaplotypeSize;
|
||||
|
||||
boolean doneHaplotype = false;
|
||||
while (!doneHaplotype) {
|
||||
|
||||
haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, currentHaplotypeSize);
|
||||
String[] haplotypeStrings = new String[haplotypesInVC.size()];
|
||||
for (int k = 0; k < haplotypeStrings.length; k++)
|
||||
haplotypeStrings[k] = new String(haplotypesInVC.get(k).getBasesAsBytes());
|
||||
|
||||
// check if all strings are the same.
|
||||
for (int k = 1; k < haplotypeStrings.length; k++) {
|
||||
if (!haplotypeStrings[k].equals(haplotypeStrings[0]))
|
||||
doneHaplotype = true;
|
||||
|
||||
}
|
||||
|
||||
if (doneHaplotype)
|
||||
break;
|
||||
else
|
||||
currentHaplotypeSize = 2*currentHaplotypeSize;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
else {
|
||||
if (currentHaplotypeSize < minHaplotypeSize)
|
||||
throw new UserException.BadArgumentValue("Insufficient haplotype length to represent all indels:",
|
||||
String.valueOf(currentHaplotypeSize));
|
||||
|
||||
haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, currentHaplotypeSize);
|
||||
}
|
||||
|
||||
// combine likelihoods for all possible haplotype pair (n*(n-1)/2 combinations)
|
||||
|
||||
double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()];
|
||||
double bestLikelihood = 1e13;
|
||||
// for given allele haplotype, compute likelihood of read pileup given haplotype
|
||||
ReadBackedPileup pileup = context.getBasePileup().getPileupWithoutMappingQualityZeroReads();
|
||||
|
||||
if (pileup.getSamples().size() > 1) {
|
||||
throw new UserException.BadInput("Currently only a single-sample BAM is supported for Indel genotyper");
|
||||
}
|
||||
|
||||
|
||||
// compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them.
|
||||
// In general, we'll assume: even spread of indels throughout genome (not true, but simplifying assumption),
|
||||
// and memoryless spread (i.e. probability that an indel lies in an interval A is independent of probability of
|
||||
// another indel lying in interval B iff A and B don't overlap), then we can approximate inter-indel distances
|
||||
// by an exponential distribution of mean 1/theta (theta = heterozygozity), and the number of indels on an interval
|
||||
// of size L is Poisson-distributed with parameter lambda = theta*L.
|
||||
|
||||
// Since typically, for small haplotype sizes and human heterozygozity, lambda will be <<1, we'll further approximate it
|
||||
// by assuming that only one indel can happen in a particular interval, with Pr(indel present) = lambda*exp(-lambda), and
|
||||
// pr(no indel) = 1-lambda*exp(-lambda) ~= exp(-lambda) for small lambda.
|
||||
|
||||
// We also assume that a deletion is equally likely as an insertion (empirical observation, see e.g. Mills et al, Genome Research 2006)
|
||||
// and we assume the following frequency spectrum for indel sizes Pr(event Length = L)= K*abs(L)^(-1.89)*10^(-0.015*abs(L)),
|
||||
// taking positive L = insertions, negative L = deletions. K turns out to be about 1.5716 for probabilities to sum to one.
|
||||
// so -10*log10(Pr event Length = L) =-10*log10(K)+ 18.9*log10(abs(L)) + 0.15*abs(L).
|
||||
// Hence, Pr(observe event size = L in interval) ~ Pr(observe event L | event present) Pr (event present in interval)
|
||||
// and -10*log10(above) = -10*log10(K)+ 18.9*log10(abs(L)) + 0.15*abs(L) - 10*log10(theta*L), and we ignore terms that would be
|
||||
// added to ref hypothesis.
|
||||
// Equation above is prior model.
|
||||
|
||||
if (!useFlatPriors) {
|
||||
|
||||
double lambda = (double)HAPLOTYPE_SIZE * indelHeterozygosity;
|
||||
double altPrior = HaplotypeIndelErrorModel.probToQual(lambda)-HaplotypeIndelErrorModel.probToQual(eventLength)*1.89 + 0.15*eventLength
|
||||
+ HaplotypeIndelErrorModel.probToQual(1.5716)+ HaplotypeIndelErrorModel.probToQual(0.5);
|
||||
|
||||
haplotypeLikehoodMatrix[0][1] = altPrior;
|
||||
haplotypeLikehoodMatrix[1][1] = 2*altPrior;
|
||||
}
|
||||
|
||||
int bestIndexI =-1, bestIndexJ=-1;
|
||||
double callConfidence = bestLikelihood;
|
||||
|
||||
|
||||
if (pileup.getReads().size() > 0) {
|
||||
double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()];
|
||||
int i=0;
|
||||
for (SAMRecord read : pileup.getReads()) {
|
||||
|
||||
// for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
|
||||
// = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
|
||||
for (int j=0; j < haplotypesInVC.size(); j++) {
|
||||
readLikelihoods[i][j]= model.computeReadLikelihoodGivenHaplotype(haplotypesInVC.get(j), read, vc, eventLength);
|
||||
if (DEBUG) {
|
||||
System.out.print(read.getReadName()+" ");
|
||||
|
||||
System.out.format("%d %d S:%d US:%d E:%d UE:%d C:%s %3.4f\n",i, j, read.getAlignmentStart(),
|
||||
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
|
||||
read.getCigarString(), readLikelihoods[i][j]);
|
||||
}
|
||||
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
for (i=0; i < haplotypesInVC.size(); i++) {
|
||||
for (int j=i; j < haplotypesInVC.size(); j++){
|
||||
// combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
|
||||
// L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2)
|
||||
//readLikelihoods[k][j] has log10(Pr(R_k) | H[j] )
|
||||
double[] readLikelihood = new double[2]; // diploid sample
|
||||
for (int readIdx=0; readIdx < pileup.getReads().size(); readIdx++) {
|
||||
readLikelihood[0] = -readLikelihoods[readIdx][i]/10;
|
||||
readLikelihood[1] = -readLikelihoods[readIdx][j]/10;
|
||||
|
||||
double probRGivenHPair = MathUtils.sumLog10(readLikelihood)/2;
|
||||
haplotypeLikehoodMatrix[i][j] += HaplotypeIndelErrorModel.probToQual(probRGivenHPair);
|
||||
|
||||
}
|
||||
|
||||
|
||||
if (haplotypeLikehoodMatrix[i][j] < bestLikelihood) {
|
||||
bestIndexI = i;
|
||||
bestIndexJ = j;
|
||||
|
||||
callConfidence = bestLikelihood - haplotypeLikehoodMatrix[i][j];
|
||||
bestLikelihood = haplotypeLikehoodMatrix[i][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//we say that most likely genotype at site is index (i,j) maximizing likelihood matrix
|
||||
String type;
|
||||
if (vc.isDeletion())
|
||||
type = "DEL";
|
||||
else if (vc.isInsertion())
|
||||
type = "INS";
|
||||
else
|
||||
type = "OTH";
|
||||
|
||||
type += vc.getIndelLengths().toString();
|
||||
|
||||
|
||||
out.format("%s %d %s ",vc.getChr(), vc.getStart(), type);
|
||||
|
||||
if (doTrio) {
|
||||
Genotype originalGenotype = vc.getGenotype("NA12878");
|
||||
Genotype dadGenotype = vc.getGenotype("NA12891");
|
||||
Genotype momGenotype = vc.getGenotype("NA12892");
|
||||
|
||||
String dadG, momG, oldG, newG;
|
||||
oldG = getGenotypeString(originalGenotype);
|
||||
dadG = getGenotypeString(dadGenotype);
|
||||
momG = getGenotypeString(momGenotype);
|
||||
|
||||
int x = bestIndexI+bestIndexJ;
|
||||
if (x == 0)
|
||||
newG = "HOMREF";
|
||||
else if (x == 1)
|
||||
newG = "HET";
|
||||
else if (x == 2)
|
||||
newG = "HOMVAR";
|
||||
else if (x < 0)
|
||||
newG = "NOCALL";
|
||||
else
|
||||
newG = "OTHER";
|
||||
|
||||
|
||||
if (callConfidence < STANDARD_CONFIDENCE_FOR_CALLING) {
|
||||
newG = "NOCALL";
|
||||
}
|
||||
out.format("NewG %s OldG %s DadG %s MomG %s\n", newG, oldG, dadG, momG);
|
||||
}
|
||||
else {
|
||||
|
||||
for (String sample: sampleNames) {
|
||||
String oldG, newG;
|
||||
Genotype originalGenotype = vc.getGenotype(sample);
|
||||
oldG = getGenotypeString(originalGenotype);
|
||||
int x = bestIndexI+bestIndexJ;
|
||||
if (x == 0)
|
||||
newG = "HOMREF";
|
||||
else if (x == 1)
|
||||
newG = "HET";
|
||||
else if (x == 2)
|
||||
newG = "HOMVAR";
|
||||
else if (x < 0)
|
||||
newG = "NOCALL";
|
||||
else
|
||||
newG = "OTHER";
|
||||
|
||||
if (callConfidence < STANDARD_CONFIDENCE_FOR_CALLING) {
|
||||
newG = "NOCALL";
|
||||
}
|
||||
|
||||
out.format("NewG %s OldG %s\n", newG, oldG);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
if ( context.hasExtendedEventPileup() ) {
|
||||
|
||||
// if we got indels at current position:
|
||||
|
||||
ReadBackedExtendedEventPileup pileup = context.getExtendedEventPileup().getPileupWithoutMappingQualityZeroReads();
|
||||
if ( pileup.size() < MIN_COVERAGE ) return 0;
|
||||
|
||||
if ( pileup.getNumberOfDeletions() + pileup.getNumberOfInsertions() > MAX_INDELS ) {
|
||||
// we got too many indel events. Maybe it's even a true event, and what we are looking for are
|
||||
// errors rather than true calls. Hence, we do not need these indels. We have to 1) discard
|
||||
// all remaining indels from the buffer: if they are still in the buffer, they are too close
|
||||
// to the current position; and 2) make sure that the next position at which we attempt to count again is
|
||||
// sufficiently far *after* the current position.
|
||||
// System.out.println("Non countable indel event at "+pileup.getLocation());
|
||||
countableIndelBuffer.clear();
|
||||
coverageBuffer.clear(); // we do not want to count observations (read bases) around non-countable indel as well
|
||||
skipToLoc = GenomeLocParser.createGenomeLoc(pileup.getLocation().getContigIndex(),pileup.getLocation().getStop()+pileup.getMaxDeletionLength()+MIN_DISTANCE+1);
|
||||
// System.out.println("Skip to "+skipToLoc);
|
||||
} else {
|
||||
// pileup does not contain too many indels, we need to store them in the buffer and count them later,
|
||||
// if a non-countable indel event(s) do not show up too soon:
|
||||
countableIndelBuffer.add(pileup);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
return 1; //To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
|
||||
private String getGenotypeString(Genotype genotype) {
|
||||
String oldG;
|
||||
if (genotype.isHomRef())
|
||||
oldG = "HOMREF";
|
||||
else if (genotype.isHet())
|
||||
oldG = "HET";
|
||||
else if (genotype.isHomVar())
|
||||
oldG = "HOMVAR";
|
||||
else
|
||||
oldG = "OTHER";
|
||||
|
||||
return oldG;
|
||||
}
|
||||
/**
|
||||
* Provide an initial value for reduce computations.
|
||||
*
|
||||
* @return Initial value of reduce.
|
||||
*/
|
||||
public Integer reduceInit() {
|
||||
return 0; //To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduces a single map with the accumulator provided as the ReduceType.
|
||||
*
|
||||
* @param value result of the map.
|
||||
* @param sum accumulator for the reduce.
|
||||
* @return accumulator with result of the map taken into account.
|
||||
*/
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value+sum; //To change body of implemented methods use File | Settings | File Templates.
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(Integer result) {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -179,7 +179,8 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
|||
AFMatrix.clear();
|
||||
|
||||
for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) {
|
||||
if ( isClearRefCall(GL) ) {
|
||||
// todo - gdebug workaround for crash
|
||||
if (false) { //isClearRefCall(GL) ) {
|
||||
refCalls.add(GL);
|
||||
} else {
|
||||
AFMatrix.setLikelihoods(GL.getPosteriors(), GL.getSample());
|
||||
|
|
@ -206,7 +207,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
|||
}
|
||||
}
|
||||
|
||||
private enum GenotypeType { AA, AB, BB }
|
||||
protected enum GenotypeType { AA, AB, BB }
|
||||
protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE;
|
||||
|
||||
protected static class AlleleFrequencyMatrix {
|
||||
|
|
|
|||
|
|
@ -26,7 +26,12 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
|
|
@ -37,9 +42,23 @@ import org.broad.tribble.util.variantcontext.Allele;
|
|||
import java.util.*;
|
||||
|
||||
public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
|
||||
private final int maxReadDeletionLength = 3;
|
||||
private final double insertionStartProbability = 1e-3;
|
||||
private final double insertionEndProbability = 0.5;
|
||||
private final double alphaDeletionProbability = 1e-3;
|
||||
private final int HAPLOTYPE_SIZE = 80;
|
||||
|
||||
// todo - the following need to be exposed for command line argument control
|
||||
private final double indelHeterozygosity = 1.0/8000;
|
||||
boolean useFlatPriors = true;
|
||||
boolean DEBUGOUT = false;
|
||||
|
||||
private HaplotypeIndelErrorModel model;
|
||||
|
||||
protected DindelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||
super(UAC, logger);
|
||||
model = new HaplotypeIndelErrorModel(maxReadDeletionLength, insertionStartProbability,
|
||||
insertionEndProbability, alphaDeletionProbability, HAPLOTYPE_SIZE, false, DEBUGOUT);
|
||||
}
|
||||
|
||||
public Allele getLikelihoods(RefMetaDataTracker tracker,
|
||||
|
|
@ -48,17 +67,71 @@ public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoo
|
|||
StratifiedAlignmentContext.StratifiedContextType contextType,
|
||||
GenotypePriors priors,
|
||||
Map<String, BiallelicGenotypeLikelihoods> GLs) {
|
||||
// TODO: check to make sure the priors instanceof a valid priors class
|
||||
|
||||
// TODO: create a single set of Alleles to be passed into each BiallelicGenotypeLikelihoods object to minimize memory consumption
|
||||
if ( tracker == null )
|
||||
return null;
|
||||
|
||||
|
||||
VariantContext vc = tracker.getVariantContext(ref, "indels", null, ref.getLocus(), true);
|
||||
// ignore places where we don't have a variant
|
||||
if ( vc == null )
|
||||
return null;
|
||||
|
||||
|
||||
if (!vc.isIndel())
|
||||
return null;
|
||||
|
||||
if ( !(priors instanceof DiploidIndelGenotypePriors) )
|
||||
throw new StingException("Only diploid-based Indel priors are supported in the DINDEL GL model");
|
||||
|
||||
int eventLength = vc.getReference().getBaseString().length() - vc.getAlternateAllele(0).getBaseString().length();
|
||||
// assume only one alt allele for now
|
||||
if (eventLength<0)
|
||||
eventLength = - eventLength;
|
||||
|
||||
int currentHaplotypeSize = HAPLOTYPE_SIZE;
|
||||
List<Haplotype> haplotypesInVC = new ArrayList<Haplotype>();
|
||||
int minHaplotypeSize = Haplotype.LEFT_WINDOW_SIZE + eventLength + 2; // to be safe
|
||||
|
||||
// int numSamples = getNSamples(contexts);
|
||||
haplotypesInVC = Haplotype.makeHaplotypeListFromVariantContextAlleles( vc, ref, currentHaplotypeSize);
|
||||
// For each sample, get genotype likelihoods based on pileup
|
||||
// compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them.
|
||||
// initialize the GenotypeLikelihoods
|
||||
GLs.clear();
|
||||
|
||||
double[][] haplotypeLikehoodMatrix;
|
||||
|
||||
if (useFlatPriors) {
|
||||
priors = new DiploidIndelGenotypePriors();
|
||||
}
|
||||
else
|
||||
priors = new DiploidIndelGenotypePriors(indelHeterozygosity,eventLength,currentHaplotypeSize);
|
||||
|
||||
//double[] priorLikelihoods = priors.getPriors();
|
||||
|
||||
for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
|
||||
// TODO: fill me in
|
||||
AlignmentContext context = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
||||
|
||||
//GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(), refAllele, altAllele, ...));
|
||||
ReadBackedPileup pileup = null;
|
||||
if (context.hasExtendedEventPileup())
|
||||
pileup = context.getExtendedEventPileup();
|
||||
else if (context.hasBasePileup())
|
||||
pileup = context.getBasePileup();
|
||||
|
||||
if (pileup != null ) {
|
||||
haplotypeLikehoodMatrix = model.computeReadHaplotypeLikelihoods( pileup, haplotypesInVC, vc, eventLength);
|
||||
|
||||
|
||||
double[] genotypeLikelihoods = HaplotypeIndelErrorModel.getPosteriorProbabilitesFromHaplotypeLikelihoods( haplotypeLikehoodMatrix);
|
||||
|
||||
GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),vc.getReference(),
|
||||
vc.getAlternateAllele(0),
|
||||
Math.log10(genotypeLikelihoods[0]),Math.log10(genotypeLikelihoods[1]), Math.log10(genotypeLikelihoods[2])));
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: return the reference Allele
|
||||
return null;
|
||||
return vc.getReference();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,125 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: delangel
|
||||
* Date: Sep 30, 2010
|
||||
* Time: 1:47:55 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class DiploidIndelGenotypePriors implements GenotypePriors {
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Constants and static information
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
public static final double INDEL_HETEROZYGOSITY = 1e-4;
|
||||
|
||||
private final static double[] flatPriors = new double[DiploidGenotype.values().length];
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Diploid priors
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
private double[] priors = null;
|
||||
|
||||
/**
|
||||
* Create a new DiploidGenotypePriors object with flat priors for each diploid genotype
|
||||
*/
|
||||
public DiploidIndelGenotypePriors() {
|
||||
priors = flatPriors.clone();
|
||||
}
|
||||
|
||||
public DiploidIndelGenotypePriors(double indelHeterozygosity, int eventLength, int haplotypeSize) {
|
||||
double varPrior = getHaplotypePriors(indelHeterozygosity, eventLength, haplotypeSize);
|
||||
priors[2] = Math.log10(varPrior*varPrior);
|
||||
priors[1] = Math.log10(2*varPrior*(1-varPrior));
|
||||
priors[0] = Math.log10((1-varPrior)*(1-varPrior));
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||
*
|
||||
* @return log10 prior as a double array
|
||||
*/
|
||||
public double[] getPriors() {
|
||||
return priors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the prior associated with DiploidGenotype g
|
||||
* @param g
|
||||
* @return log10 prior as a double
|
||||
*/
|
||||
public double getPrior(DiploidGenotype g) {
|
||||
return getPriors()[g.ordinal()];
|
||||
}
|
||||
|
||||
public double getHeterozygosity() { return INDEL_HETEROZYGOSITY; }
|
||||
|
||||
public boolean validate(boolean throwException) {
|
||||
try {
|
||||
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
int i = g.ordinal();
|
||||
if ( ! MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) {
|
||||
String bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i]));
|
||||
throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
|
||||
}
|
||||
}
|
||||
} catch ( IllegalStateException e ) {
|
||||
if ( throwException )
|
||||
throw new RuntimeException(e);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public double getHaplotypePriors(double indelHeterozygosity, int eventLength, int haplotypeSize) {
|
||||
// compute prior likelihoods on haplotypes.
|
||||
// In general, we'll assume: even spread of indels throughout genome (not true, but simplifying assumption),
|
||||
// and memoryless spread (i.e. probability that an indel lies in an interval A is independent of probability of
|
||||
// another indel lying in interval B iff A and B don't overlap), then we can approximate inter-indel distances
|
||||
// by an exponential distribution of mean 1/theta (theta = heterozygozity), and the number of indels on an interval
|
||||
// of size L is Poisson-distributed with parameter lambda = theta*L.
|
||||
|
||||
// Since typically, for small haplotype sizes and human heterozygozity, lambda will be <<1, we'll further approximate it
|
||||
// by assuming that only one indel can happen in a particular interval, with Pr(indel present) = lambda*exp(-lambda), and
|
||||
// pr(no indel) = 1-lambda*exp(-lambda) ~= exp(-lambda) for small lambda.
|
||||
|
||||
// We also assume that a deletion is equally likely as an insertion (empirical observation, see e.g. Mills et al, Genome Research 2006)
|
||||
// and we assume the following frequency spectrum for indel sizes Pr(event Length = L)= K*abs(L)^(-1.89)*10^(-0.015*abs(L)),
|
||||
// taking positive L = insertions, negative L = deletions. K turns out to be about 1.5716 for probabilities to sum to one.
|
||||
// so -10*log10(Pr event Length = L) =-10*log10(K)+ 18.9*log10(abs(L)) + 0.15*abs(L).
|
||||
// Hence, Pr(observe event size = L in interval) ~ Pr(observe event L | event present) Pr (event present in interval)
|
||||
// and -10*log10(above) = -10*log10(K)+ 18.9*log10(abs(L)) + 0.15*abs(L) - 10*log10(theta*L), and we ignore terms that would be
|
||||
// added to ref hypothesis.
|
||||
// Equation above is prior model.
|
||||
|
||||
double lambda = (double)haplotypeSize * indelHeterozygosity;
|
||||
return HaplotypeIndelErrorModel.probToQual(lambda)-HaplotypeIndelErrorModel.probToQual(eventLength)*1.89 + 0.15*eventLength
|
||||
+ HaplotypeIndelErrorModel.probToQual(1.5716)+ HaplotypeIndelErrorModel.probToQual(0.5);
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
static {
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
flatPriors[g.ordinal()] = Math.log10(1.0 / DiploidGenotype.values().length);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -26,6 +26,12 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -35,10 +41,9 @@ import java.io.PrintStream;
|
|||
|
||||
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||
|
||||
private int maxNumIterations = 50;
|
||||
private double tol = 1e-4;
|
||||
private boolean DEBUGOUT = true;
|
||||
|
||||
private boolean DEBUGOUT = false;
|
||||
private boolean SIMPLE_GREEDY_GENOTYPER = false;
|
||||
|
||||
protected ExactAFCalculationModel(int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(N, logger, verboseWriter);
|
||||
}
|
||||
|
|
@ -50,30 +55,21 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
double[] log10AlleleFrequencyPosteriors) {
|
||||
|
||||
// Math requires linear math to make efficient updates.
|
||||
double[] alleleFrequencyPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPriors);
|
||||
double[] alleleFrequencyPriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPriors);
|
||||
double[] alleleFrequencyPosteriors;
|
||||
|
||||
|
||||
// now that we have genotype likelihoods for each sample, we can start refining allele frequency estimate
|
||||
alleleFrequencyPosteriors = updateAFEstimate(GLs, alleleFrequencyPriors);
|
||||
double meanAF = computeMeanAF(alleleFrequencyPosteriors);
|
||||
|
||||
if (DEBUGOUT)
|
||||
System.out.format("Initial Mean AF: %5.6f\n", meanAF);
|
||||
System.out.format("Mean AF: %5.4f. PVariant: %5.5f\n", meanAF,1.0-alleleFrequencyPosteriors[0]);
|
||||
|
||||
|
||||
for (int numIterations=0; numIterations < maxNumIterations; numIterations++) {
|
||||
double oldMeanAF = meanAF;
|
||||
alleleFrequencyPosteriors = updateAFEstimate(GLs, alleleFrequencyPosteriors);
|
||||
meanAF = computeMeanAF(alleleFrequencyPosteriors);
|
||||
|
||||
if (DEBUGOUT)
|
||||
System.out.format("Mean AF: %5.4f. PVariant: %5.5f\n", meanAF,1.0-alleleFrequencyPosteriors[0]);
|
||||
|
||||
if (Math.abs(meanAF-oldMeanAF) < tol)
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
for (int k=0; k < alleleFrequencyPosteriors.length; k++)
|
||||
for (int k=0; k < alleleFrequencyPosteriors.length; k++) {
|
||||
log10AlleleFrequencyPosteriors[k] = Math.log10(alleleFrequencyPosteriors[k]);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private double[] updateAFEstimate(Map<String, BiallelicGenotypeLikelihoods> GLs, double[] alleleFrequencyPriors) {
|
||||
|
|
@ -93,11 +89,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
double den = 2.0*j*(2.0*j-1);
|
||||
|
||||
for (int k=0; k <= 2*j; k++ ) {
|
||||
double tmp = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[2];
|
||||
double tmp = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()];
|
||||
if (k > 0)
|
||||
tmp += (2.0*k)*(2.0*j-k)*YMatrix[j-1][k-1]*genotypeLikelihoods[1];
|
||||
tmp += (2.0*k)*(2.0*j-k)*YMatrix[j-1][k-1]*genotypeLikelihoods[GenotypeType.AB.ordinal()];
|
||||
if (k > 1)
|
||||
tmp += k*(k-1)*YMatrix[j-1][k-2]*genotypeLikelihoods[0];
|
||||
tmp += k*(k-1)*YMatrix[j-1][k-2]*genotypeLikelihoods[GenotypeType.BB.ordinal()];
|
||||
|
||||
YMatrix[j][k] = tmp/den;
|
||||
|
||||
|
|
@ -112,7 +108,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
double[] newAF = new double[alleleFrequencyPriors.length];
|
||||
|
||||
for (int k=0; k <= numChr; k++) {
|
||||
double prod = YMatrix[j][numChr-k] * alleleFrequencyPriors[k];
|
||||
double prod = YMatrix[j][k] * alleleFrequencyPriors[k];
|
||||
newAF[k] = prod;
|
||||
sum += prod;
|
||||
}
|
||||
|
|
@ -135,4 +131,112 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
|
||||
}
|
||||
|
||||
protected Map<String, Genotype> generateCalls(Map<String, StratifiedAlignmentContext> contexts,
|
||||
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
||||
int bestAFguess) {
|
||||
// first experiment: refine do genotype assignment by Hardy Weinberg equilibrium assumption.
|
||||
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
|
||||
|
||||
|
||||
double[][] pathMetricArray = new double[GLs.size()+1][bestAFguess+1];
|
||||
int[][] tracebackArray = new int[GLs.size()+1][bestAFguess+1];
|
||||
|
||||
ArrayList<String> sampleIndices = new ArrayList<String>();
|
||||
int sampleIdx = 0;
|
||||
|
||||
if (SIMPLE_GREEDY_GENOTYPER) {
|
||||
sampleIndices.addAll(GLs.keySet());
|
||||
sampleIdx = GLs.size();
|
||||
}
|
||||
else {
|
||||
|
||||
for (String sample: GLs.keySet()) {
|
||||
sampleIndices.add(sample);
|
||||
|
||||
double[] likelihoods = GLs.get(sample).getLikelihoods();
|
||||
|
||||
for (int k=0; k <= bestAFguess; k++) {
|
||||
|
||||
double bestMetric = pathMetricArray[sampleIdx][k] + likelihoods[0];
|
||||
int bestIndex = k;
|
||||
|
||||
if (k>0) {
|
||||
double m2 = pathMetricArray[sampleIdx][k-1] + likelihoods[1];
|
||||
if (m2 > bestMetric) {
|
||||
bestMetric = m2;
|
||||
bestIndex = k-1;
|
||||
}
|
||||
}
|
||||
|
||||
if (k>1) {
|
||||
double m2 = pathMetricArray[sampleIdx][k-2] + likelihoods[2];
|
||||
if (m2 > bestMetric) {
|
||||
bestMetric = m2;
|
||||
bestIndex = k-2;
|
||||
}
|
||||
}
|
||||
|
||||
pathMetricArray[sampleIdx+1][k] = bestMetric;
|
||||
tracebackArray[sampleIdx+1][k] = bestIndex;
|
||||
}
|
||||
sampleIdx++;
|
||||
}
|
||||
}
|
||||
|
||||
int startIdx = bestAFguess;
|
||||
for (int k = sampleIdx; k > 0; k--) {
|
||||
int bestGTguess;
|
||||
String sample = sampleIndices.get(k-1);
|
||||
BiallelicGenotypeLikelihoods GL = GLs.get(sample);
|
||||
Allele alleleA = GL.getAlleleA();
|
||||
Allele alleleB = GL.getAlleleB();
|
||||
|
||||
if (SIMPLE_GREEDY_GENOTYPER)
|
||||
bestGTguess = Utils.findIndexOfMaxEntry(GLs.get(sample).getLikelihoods());
|
||||
else {
|
||||
int newIdx = tracebackArray[k][startIdx];
|
||||
bestGTguess = startIdx - newIdx;
|
||||
startIdx = newIdx;
|
||||
}
|
||||
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||
attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());
|
||||
double qual = 0.0;
|
||||
double[] posteriors = GLs.get(sample).getPosteriors();
|
||||
|
||||
if (bestGTguess == 0) {
|
||||
myAlleles.add(alleleA);
|
||||
myAlleles.add(alleleA);
|
||||
//qual = posteriors[0] - Math.max(posteriors[1],posteriors[2]);
|
||||
} else if(bestGTguess == 1) {
|
||||
myAlleles.add(alleleA);
|
||||
myAlleles.add(alleleB);
|
||||
//qual = posteriors[1] - Math.max(posteriors[0],posteriors[2]);
|
||||
|
||||
} else {
|
||||
myAlleles.add(alleleB);
|
||||
myAlleles.add(alleleB);
|
||||
// qual = posteriors[2] - Math.max(posteriors[1],posteriors[0]);
|
||||
}
|
||||
/*
|
||||
if (qual <= 0.0) {
|
||||
qual = 0.0;
|
||||
myAlleles.clear();
|
||||
myAlleles.add(Allele.NO_CALL);
|
||||
// myAlleles.add(Allele.NO_CALL);
|
||||
}
|
||||
*/
|
||||
attributes.put(VCFConstants.GENOTYPE_QUALITY_KEY,String.format("%4.2f", 10*qual));
|
||||
|
||||
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
|
||||
attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());
|
||||
calls.put(sample, new Genotype(sample, myAlleles, qual, null, attributes, false));
|
||||
|
||||
}
|
||||
|
||||
|
||||
return calls;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -164,7 +164,9 @@ public class UnifiedGenotyperEngine {
|
|||
if ( bestAFguess != 0 ) {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
|
||||
if ( Double.isInfinite(phredScaledConfidence) )
|
||||
phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0];
|
||||
// todo - verify this is OK
|
||||
phredScaledConfidence = (double)QualityUtils.MAX_QUAL_SCORE;
|
||||
//phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0];
|
||||
} else {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
|
||||
if ( Double.isInfinite(phredScaledConfidence) ) {
|
||||
|
|
@ -224,11 +226,22 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
|
||||
GenomeLoc loc = refContext.getLocus();
|
||||
VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes);
|
||||
|
||||
// todo - temp fix until we can deal with extended events properly
|
||||
//VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes);
|
||||
VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(),
|
||||
loc.getStart()+refAllele.length(), alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes);
|
||||
|
||||
|
||||
if ( annotationEngine != null ) {
|
||||
// first off, we want to use the *unfiltered* context for the annotations
|
||||
stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE);
|
||||
ReadBackedPileup pileup = null;
|
||||
if (rawContext.hasExtendedEventPileup())
|
||||
pileup = rawContext.getExtendedEventPileup();
|
||||
else if (rawContext.hasBasePileup())
|
||||
pileup = rawContext.getBasePileup();
|
||||
|
||||
stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UAC.ASSUME_SINGLE_SAMPLE);
|
||||
|
||||
Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vc);
|
||||
vc = variantContexts.iterator().next(); //We know the collection will always have exactly 1 element.
|
||||
|
|
@ -413,8 +426,8 @@ public class UnifiedGenotyperEngine {
|
|||
priors = new DiploidSNPGenotypePriors();
|
||||
break;
|
||||
case DINDEL:
|
||||
// TODO: create indel priors object
|
||||
priors = null;
|
||||
// create flat priors for Indels, actual priors will depend on event length to be genotyped
|
||||
priors = new DiploidIndelGenotypePriors();
|
||||
break;
|
||||
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue