Fare thee well, UGv1. Here come the days UGv2.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4747 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
727dac7b7a
commit
d89e17ec8c
|
|
@ -30,7 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidGenotypePriors;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidSNPGenotypePriors;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
|
|
@ -68,9 +68,9 @@ public class GATKPaperGenotyper extends LocusWalker<Integer,Long> implements Tre
|
|||
if (ref.getBase() == 'N' || ref.getBase() == 'n') return null; // we don't deal with the N ref base case
|
||||
|
||||
ReadBackedPileup pileup = context.getBasePileup().getPileupWithoutMappingQualityZeroReads();
|
||||
double likelihoods[] = DiploidGenotypePriors.getReferencePolarizedPriors(ref.getBase(),
|
||||
DiploidGenotypePriors.HUMAN_HETEROZYGOSITY,
|
||||
0.01);
|
||||
double likelihoods[] = DiploidSNPGenotypePriors.getReferencePolarizedPriors(ref.getBase(),
|
||||
DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY,
|
||||
0.01);
|
||||
// get the bases and qualities from the pileup
|
||||
byte bases[] = pileup.getBases();
|
||||
byte quals[] = pileup.getQuals();
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@
|
|||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
|
|
@ -1,7 +1,31 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
/**
 * The base-substitution error models selectable for genotype likelihood calculation.
 *
 * NOTE(review): semantics inferred from names only — confirm against the
 * likelihood classes that consume this enum (e.g. EmpiricalSubstitutionProbabilities
 * appears elsewhere in this commit and presumably backs EMPIRICAL).
 */
public enum BaseMismatchModel {
    ONE_STATE,    // single shared mismatch rate for all substitutions — TODO confirm
    THREE_STATE,  // per-substitution-class mismatch rates — TODO confirm
    EMPIRICAL     // empirically measured substitution probabilities — TODO confirm
}
|
||||
|
|
|
|||
|
|
@ -71,7 +71,7 @@ public class BatchedCallsMerger extends LocusWalker<VariantContext, Integer> imp
|
|||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
// enable extended events for indels
|
||||
public boolean generateExtendedEvents() { return UAC.genotypeModel == GenotypeCalculationModel.Model.DINDEL; }
|
||||
public boolean generateExtendedEvents() { return UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL; }
|
||||
|
||||
public void initialize() {
|
||||
|
||||
|
|
@ -90,8 +90,7 @@ public class BatchedCallsMerger extends LocusWalker<VariantContext, Integer> imp
|
|||
}
|
||||
|
||||
// update the engine
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, writer, null, null);
|
||||
UG_engine.samples = samples;
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);
|
||||
|
||||
// initialize the header
|
||||
writer.writeHeader(new VCFHeader(headerLines, samples));
|
||||
|
|
@ -127,7 +126,7 @@ public class BatchedCallsMerger extends LocusWalker<VariantContext, Integer> imp
|
|||
// we are missing samples, call them
|
||||
if ( missedSamples.size() > 0 ) {
|
||||
AlignmentContext prunedContext = filterForSamples(context.getBasePileup(), missedSamples);
|
||||
VariantCallContext vcc = UG_engine.runGenotyper(tracker, ref, prunedContext);
|
||||
VariantCallContext vcc = UG_engine.calculateLikelihoodsAndGenotypes(tracker, ref, prunedContext);
|
||||
if ( vcc != null && vcc.vc != null )
|
||||
calls.add(vcc.vc);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@
|
|||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
|
||||
|
|
@ -23,7 +23,7 @@
|
|||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
|
|
@ -1,292 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Diploid SNP genotype-calling model that jointly estimates the alternate allele
 * frequency across all samples (via the inherited JointEstimateGenotypeCalculationModel
 * machinery) and then emits per-sample genotype calls at the chosen frequency.
 *
 * Several members used below (UAC, log10PofDgivenAFi, LOG10_OPTIMIZATION_EPSILON,
 * VALUE_NOT_CALCULATED, getMinAlleleFrequencyToTest, ignoreAlleleFrequenciesAboveI)
 * are inherited from the superclass, which is not visible here — TODO confirm their
 * contracts against JointEstimateGenotypeCalculationModel.
 */
public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalculationModel {

    protected DiploidGenotypeCalculationModel(int N, double heterozygosity) {
        super(N, heterozygosity);
    }

    // the GenotypeLikelihoods map: per-sample likelihoods, keyed by sample name
    private HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
    // one allele-frequency matrix per alternate base, keyed by the alt base byte
    private HashMap<Byte, AlleleFrequencyMatrix> AFMatrixMap = new HashMap<Byte, AlleleFrequencyMatrix>();

    // column order of the per-sample likelihood triple; ordinal() is used as the array index
    private enum GenotypeType { REF, HET, HOM }

    /**
     * Resets per-site state and fills the per-sample genotype likelihoods and the
     * per-alt-allele frequency matrices from the stratified pileups.
     *
     * @param ref         the reference base at this site
     * @param contexts    per-sample stratified alignment contexts
     * @param contextType which stratification (e.g. COMPLETE) to read pileups from
     */
    protected void initialize(byte ref,
                              Map<String, StratifiedAlignmentContext> contexts,
                              StratifiedAlignmentContext.StratifiedContextType contextType) {
        // initialize the GenotypeLikelihoods; this object is reused across sites
        GLs.clear();
        AFMatrixMap.clear();

        // for each alternate allele, create a new matrix sized to the number of samples
        for ( byte alt : BaseUtils.BASES ) {
            if ( alt != ref )
                AFMatrixMap.put(alt, new AlleleFrequencyMatrix(contexts.size()));
        }

        // use flat priors for GLs (the allele-frequency estimation supplies the prior instead)
        DiploidGenotypePriors priors = new DiploidGenotypePriors();

        for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
            ReadBackedPileup pileup = sample.getValue().getContext(contextType).getBasePileup();

            // create the GenotypeLikelihoods object and feed it this sample's pileup
            GenotypeLikelihoods GL = new GenotypeLikelihoods(UAC.baseModel, priors, UAC.defaultPlatform);
            GL.add(pileup, true, UAC.CAP_BASE_QUALITY);
            GLs.put(sample.getKey(), GL);

            double[] posteriors = GL.getPosteriors();

            // for each alternate allele, fill the matrix with this sample's
            // (hom-ref, het, hom-var) posteriors
            DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref);
            for ( byte alt : BaseUtils.BASES ) {
                if ( alt != ref ) {
                    DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(ref, alt);
                    DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(alt);
                    AFMatrixMap.get(alt).setLikelihoods(posteriors[refGenotype.ordinal()], posteriors[hetGenotype.ordinal()], posteriors[homGenotype.ordinal()], sample.getKey());
                }
            }
        }
    }

    /**
     * Computes log10 P(D | AF=i) for every candidate alternate allele count i in
     * [0, numFrequencies], storing results in the inherited log10PofDgivenAFi table.
     * Early-exits (marking remaining frequencies ignored) once the likelihood has
     * fallen far enough below the maximum seen that it cannot affect the confidence.
     */
    protected void calculatelog10PofDgivenAFforAllF(byte ref, byte alt, int numFrequencies, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {

        AlleleFrequencyMatrix matrix = AFMatrixMap.get(alt);
        int baseIndex = BaseUtils.simpleBaseToBaseIndex(alt);

        // first, calculate for AF=0 (no change to matrix)
        log10PofDgivenAFi[baseIndex][0] = matrix.getLikelihoodsOfFrequency();
        double maxLikelihoodSeen = log10PofDgivenAFi[baseIndex][0];
        int minAlleleFrequencyToTest = getMinAlleleFrequencyToTest();

        // for each minor allele frequency, calculate log10PofDgivenAFi
        for (int i = 1; i <= numFrequencies; i++) {
            // add one more alternate allele (greedily, to the sample where it helps most)
            matrix.incrementFrequency();

            // calculate new likelihoods
            log10PofDgivenAFi[baseIndex][i] = matrix.getLikelihoodsOfFrequency();

            // an optimization to speed up the calculation: if we are beyond the local maximum such
            // that subsequent likelihoods won't factor into the confidence score, just quit
            if ( i >= minAlleleFrequencyToTest && maxLikelihoodSeen - log10PofDgivenAFi[baseIndex][i] > LOG10_OPTIMIZATION_EPSILON ) {
                ignoreAlleleFrequenciesAboveI(i, numFrequencies, baseIndex);
                return;
            }

            if ( log10PofDgivenAFi[baseIndex][i] > maxLikelihoodSeen )
                maxLikelihoodSeen = log10PofDgivenAFi[baseIndex][i];
        }
    }

    /**
     * Builds per-sample Genotype calls at the given alternate allele frequency.
     *
     * @param ref       reference base
     * @param alt       alternate base
     * @param frequency the chosen alternate allele count
     * @param contexts  per-sample contexts (used only for the DP attribute)
     * @param loc       site location (currently unused here — NOTE(review): confirm
     *                  whether the superclass contract requires it)
     * @return map from sample name to its Genotype call
     */
    protected Map<String, Genotype> makeGenotypeCalls(byte ref, byte alt, int frequency, Map<String, StratifiedAlignmentContext> contexts, GenomeLoc loc) {
        HashMap<String, Genotype> calls = new HashMap<String, Genotype>();

        // set up some variables we'll need in the loop
        AlleleFrequencyMatrix matrix = AFMatrixMap.get(alt);
        Allele refAllele = Allele.create(ref, true);
        Allele altAllele = Allele.create(alt, false);
        DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref);
        DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(ref, alt);
        DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(alt);

        for ( String sample : GLs.keySet() ) {

            // set the genotype and confidence recorded for this sample at this frequency
            Pair<Integer, Double> AFbasedGenotype = matrix.getGenotype(frequency, sample);

            ArrayList<Allele> myAlleles = new ArrayList<Allele>();
            if ( AFbasedGenotype.first == GenotypeType.REF.ordinal() ) {
                myAlleles.add(refAllele);
                myAlleles.add(refAllele);
            } else if ( AFbasedGenotype.first == GenotypeType.HET.ordinal() ) {
                myAlleles.add(refAllele);
                myAlleles.add(altAllele);
            } else { // ( AFbasedGenotype.first == GenotypeType.HOM.ordinal() )
                myAlleles.add(altAllele);
                myAlleles.add(altAllele);
            }

            // DP attribute: depth from the COMPLETE stratification regardless of contextType used above
            HashMap<String, Object> attributes = new HashMap<String, Object>();
            attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());

            // todo -- replace with GenotypeLikelihoods object in Tribble library
            double[] likelihoods = GLs.get(sample).getLikelihoods();
            String GL = String.format("%.2f,%.2f,%.2f",
                    likelihoods[refGenotype.ordinal()],
                    likelihoods[hetGenotype.ordinal()],
                    likelihoods[homGenotype.ordinal()]);
            attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, GL);

            calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, attributes, false));
        }

        return calls;
    }


    /**
     * Greedy maximizer over per-sample genotype likelihoods at a fixed alternate
     * allele count.  Each call to incrementFrequency() adds one alternate allele
     * to whichever sample's upgrade (REF->HET or HET->HOM) gains the most likelihood;
     * getLikelihoodsOfFrequency() then sums the active cells and records the
     * per-sample genotype/score snapshot for the current frequency.
     */
    protected static class AlleleFrequencyMatrix {

        private double[][] matrix;    // allele frequency matrix: [sample][REF|HET|HOM] likelihoods
        private int[] indexes;        // matrix to maintain which genotype is active per sample
        private int N;                // number of samples (so max frequency is 2N)
        private int frequency;        // current frequency (alternate allele count)

        // data structures necessary to maintain a list of the best genotypes and their scores
        private ArrayList<String> samples = new ArrayList<String>();
        private HashMap<Integer, HashMap<String, Pair<Integer, Double>>> samplesToGenotypesPerAF = new HashMap<Integer, HashMap<String, Pair<Integer, Double>>>();

        public AlleleFrequencyMatrix(int N) {
            this.N = N;
            frequency = 0;
            matrix = new double[N][3];
            indexes = new int[N];
            // everyone starts as hom-ref (GenotypeType.REF.ordinal() == 0)
            for (int i = 0; i < N; i++)
                indexes[i] = 0;
        }

        /**
         * Registers one sample's (hom-ref, het, hom-var) likelihoods.  Samples must be
         * added at most N times; row index is assigned by insertion order.
         */
        public void setLikelihoods(double AA, double AB, double BB, String sample) {
            int index = samples.size();
            samples.add(sample);
            matrix[index][GenotypeType.REF.ordinal()] = AA;
            matrix[index][GenotypeType.HET.ordinal()] = AB;
            matrix[index][GenotypeType.HOM.ordinal()] = BB;
        }

        /**
         * Adds one alternate allele to the sample whose genotype upgrade gains the
         * most likelihood.  Throws if already at the maximum of 2N alleles or if no
         * sample can accept another allele.
         */
        public void incrementFrequency() {
            if ( frequency == 2 * N )
                throw new ReviewedStingException("Frequency was incremented past N; how is this possible?");
            frequency++;

            // VALUE_NOT_CALCULATED is inherited — presumably a very negative sentinel
            // so any real likelihood delta beats it; TODO confirm in superclass
            double greedy = VALUE_NOT_CALCULATED;
            int greedyIndex = -1;
            for (int i = 0; i < N; i++) {

                if ( indexes[i] == GenotypeType.HET.ordinal() ) {
                    if ( matrix[i][GenotypeType.HOM.ordinal()] - matrix[i][GenotypeType.HET.ordinal()] > greedy ) {
                        greedy = matrix[i][GenotypeType.HOM.ordinal()] - matrix[i][GenotypeType.HET.ordinal()];
                        greedyIndex = i;
                    }
                }
                else if ( indexes[i] == GenotypeType.REF.ordinal() ) {
                    if ( matrix[i][GenotypeType.HET.ordinal()] - matrix[i][GenotypeType.REF.ordinal()] > greedy ) {
                        greedy = matrix[i][GenotypeType.HET.ordinal()] - matrix[i][GenotypeType.REF.ordinal()];
                        greedyIndex = i;
                    }
                    // note that we currently don't bother with breaking ties between samples
                    // (which would be done by looking at the HOM_VAR value) because it's highly
                    // unlikely that a collision will both occur and that the difference will
                    // be significant at HOM_VAR...
                }
                // if this person is already hom var, he can't add another alternate allele
                // so we can ignore that case
            }
            if ( greedyIndex == -1 )
                throw new ReviewedStingException("There is no best choice for a new alternate allele; how is this possible?");

            if ( indexes[greedyIndex] == GenotypeType.HET.ordinal() )
                indexes[greedyIndex] = GenotypeType.HOM.ordinal();
            else
                indexes[greedyIndex] = GenotypeType.HET.ordinal();
        }

        /**
         * Sums the active likelihood cell for every sample at the current frequency,
         * recording the per-sample genotype snapshot as a side effect.
         *
         * @return the summed (log10) likelihood of the current allele-frequency configuration
         */
        public double getLikelihoodsOfFrequency() {
            double likelihoods = 0.0;
            for (int i = 0; i < N; i++)
                likelihoods += matrix[i][indexes[i]];

            // debug output, intentionally disabled:
            /*
            System.out.println(frequency);
            for (int i = 0; i < N; i++) {
                for (int j=0; j < 3; j++) {
                    System.out.print(String.valueOf(matrix[i][j]));
                    System.out.print(indexes[i] == j ? "* " : " ");
                }
                System.out.println();
            }
            System.out.println(likelihoods);
            System.out.println();
            */

            recordGenotypes();

            return likelihoods;
        }

        /**
         * @return the (genotype ordinal, confidence score) pair recorded for this
         *         sample at the given frequency; requires that
         *         getLikelihoodsOfFrequency() was called at that frequency
         */
        public Pair<Integer, Double> getGenotype(int frequency, String sample) {
            return samplesToGenotypesPerAF.get(frequency).get(sample);
        }

        // Snapshot each sample's current genotype and a confidence score for the
        // current frequency into samplesToGenotypesPerAF.
        private void recordGenotypes() {
            HashMap<String, Pair<Integer, Double>> samplesToGenotypes = new HashMap<String, Pair<Integer, Double>>();

            int index = 0;
            for ( String sample : samples ) {
                int genotype = indexes[index];

                double score;

                int maxEntry = MathUtils.maxElementIndex(matrix[index]);
                // if the max value is for the most likely genotype, we can compute next vs. next best
                if ( genotype == maxEntry ) {
                    if ( genotype == GenotypeType.REF.ordinal() )
                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.HET.ordinal()], matrix[index][GenotypeType.HOM.ordinal()]);
                    else if ( genotype == GenotypeType.HET.ordinal() )
                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.REF.ordinal()], matrix[index][GenotypeType.HOM.ordinal()]);
                    else // ( genotype == GenotypeType.HOM.ordinal() )
                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.REF.ordinal()], matrix[index][GenotypeType.HET.ordinal()]);
                }
                // otherwise, we need to calculate the probability of the genotype
                else {
                    double[] normalized = MathUtils.normalizeFromLog10(matrix[index]);
                    double chosenGenotype = normalized[genotype];
                    score = -1.0 * Math.log10(1.0 - chosenGenotype);
                }

                samplesToGenotypes.put(sample, new Pair<Integer, Double>(genotype, Math.abs(score)));
                index++;
            }

            samplesToGenotypesPerAF.put(frequency, samplesToGenotypes);
        }
    }
}
|
||||
|
|
@ -1,315 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class DiploidGenotypePriors {
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Constants and static information
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
public static final double HUMAN_HETEROZYGOSITY = 1e-3;
|
||||
public static final double CEU_HETEROZYGOSITY = 1e-3;
|
||||
public static final double YRI_HETEROZYGOSITY = 1.0 / 850;
|
||||
|
||||
/**
|
||||
* Default value of the prob of seeing a reference error. Used to calculation the
|
||||
* chance of seeing a true B/C het when the reference is A, which we assume is the product
|
||||
* of the ref error rate and the het. Default value is Q60
|
||||
*/
|
||||
public static final double PROB_OF_REFERENCE_ERROR = 1e-6; // the reference is
|
||||
|
||||
private final static double[] flatPriors = new double[DiploidGenotype.values().length];
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Diploid priors
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
private double[] priors = null;
|
||||
|
||||
// todo -- fix me when this issue is resolved
|
||||
public static final boolean requirePriorSumToOne = false;
|
||||
|
||||
/**
|
||||
* Create a new DiploidGenotypePriors object with flat priors for each diploid genotype
|
||||
*/
|
||||
public DiploidGenotypePriors() {
|
||||
priors = flatPriors.clone();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new GenotypeLikelihoods object with priors for a diploid with heterozygosity and reference
|
||||
* base ref
|
||||
*
|
||||
* @param ref
|
||||
* @param heterozygosity
|
||||
*/
|
||||
public DiploidGenotypePriors(byte ref, double heterozygosity, boolean dontPolarize) {
|
||||
if ( dontPolarize ) {
|
||||
double[] vals = heterozygosity2DiploidProbabilities(heterozygosity);
|
||||
priors = getReferenceIndependentPriors(ref, vals[0], vals[1], vals[2]);
|
||||
} else {
|
||||
priors = getReferencePolarizedPriors(ref, heterozygosity, PROB_OF_REFERENCE_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new GenotypeLikelihoods object with priors for a diploid with heterozygosity and reference
|
||||
* base ref
|
||||
*
|
||||
* @param ref
|
||||
* @param heterozygosity
|
||||
* @param probOfTriStateGenotype The prob of seeing a true B/C het when the reference is A
|
||||
*/
|
||||
public DiploidGenotypePriors(byte ref, double heterozygosity, double probOfTriStateGenotype) {
|
||||
priors = getReferencePolarizedPriors(ref, heterozygosity, probOfTriStateGenotype);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new Genotypelike Likelhoods's object with priors (in log10 space) for each of the DiploteGenotypes
|
||||
*
|
||||
* @param log10Priors
|
||||
*/
|
||||
public DiploidGenotypePriors(double[] log10Priors) {
|
||||
priors = log10Priors.clone();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||
*
|
||||
* @return log10 prior as a double array
|
||||
*/
|
||||
public double[] getPriors() {
|
||||
return priors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the prior associated with DiploidGenotype g
|
||||
* @param g
|
||||
* @return log10 prior as a double
|
||||
*/
|
||||
public double getPrior(DiploidGenotype g) {
|
||||
return getPriors()[g.ordinal()];
|
||||
}
|
||||
|
||||
|
||||
public boolean validate(boolean throwException) {
|
||||
try {
|
||||
if ( requirePriorSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(priors), 1.0) != 0 ) {
|
||||
throw new IllegalStateException(String.format("Priors don't sum to 1: sum=%f %s", MathUtils.sumLog10(priors), Arrays.toString(priors)));
|
||||
}
|
||||
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
int i = g.ordinal();
|
||||
if ( ! MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) {
|
||||
String bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i]));
|
||||
throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
|
||||
}
|
||||
}
|
||||
} catch ( IllegalStateException e ) {
|
||||
if ( throwException )
|
||||
throw new RuntimeException(e);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Static functionality
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns homozygous-reference, heterozygous, and homozygous-non-ref probabilities given a heterozygosity
|
||||
* value, as elements 0, 1, and 2 of a double[], respectively
|
||||
*
|
||||
* @param h the heterozygosity [probability of a base being heterozygous]
|
||||
*/
|
||||
@Deprecated
|
||||
public static double[] heterozygosity2DiploidProbabilities(double h) {
|
||||
double[] pdbls = new double[3];
|
||||
|
||||
pdbls[0] = heterozygosity2HomRefProbability(h);
|
||||
pdbls[1] = heterozygosity2HetProbability(h);
|
||||
pdbls[2] = heterozygosity2HomVarProbability(h);
|
||||
return pdbls;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param h
|
||||
* @return
|
||||
*/
|
||||
public static double heterozygosity2HomRefProbability(double h) {
|
||||
if (MathUtils.isNegative(h)) {
|
||||
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||
}
|
||||
|
||||
double v = 1.0 - (3.0 * h / 2.0);
|
||||
if (MathUtils.isNegative(v)) {
|
||||
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
public static double heterozygosity2HetProbability(double h) {
|
||||
if (MathUtils.isNegative(h)) {
|
||||
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||
}
|
||||
|
||||
return h;
|
||||
}
|
||||
|
||||
public static double heterozygosity2HomVarProbability(double h) {
|
||||
if (MathUtils.isNegative(h)) {
|
||||
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||
}
|
||||
|
||||
return h / 2.0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector
|
||||
* appropriately.
|
||||
*
|
||||
* Suppose A is the reference base, and we are given the probability of being hom-ref, het, and hom-var,
|
||||
* and that pTriSateGenotype is the true probability of observing reference A and a true genotype of B/C
|
||||
* then this sets the priors to:
|
||||
*
|
||||
* AA = hom-ref
|
||||
* AC = AG = AT = (het - pTriStateGenotype) / 3
|
||||
* CC = GG = TT = hom-var / 3
|
||||
* CG = CT = GT = pTriStateGenotype / 3
|
||||
*
|
||||
* So that we get:
|
||||
*
|
||||
* hom-ref + 3 * (het - pTriStateGenotype) / 3 + 3 * hom-var / 3 + 3 * pTriStateGenotype
|
||||
* hom-ref + het - pTriStateGenotype + hom-var + pTriStateGenotype
|
||||
* hom-ref + het + hom-var
|
||||
* = 1
|
||||
*
|
||||
* @param ref
|
||||
* @param heterozyosity
|
||||
* @param pRefError
|
||||
*/
|
||||
public static double[] getReferencePolarizedPriors(byte ref, double heterozyosity, double pRefError ) {
|
||||
if ( ! MathUtils.isBounded(pRefError, 0.0, 0.01) ) {
|
||||
throw new RuntimeException(String.format("BUG: p Reference error is out of bounds (0.0 - 0.01) is allow range %f", pRefError));
|
||||
}
|
||||
|
||||
double pTriStateGenotype = heterozyosity * pRefError;
|
||||
// if ( pTriStateGenotype >= heterozyosity ) {
|
||||
// throw new RuntimeException(String.format("p Tristate genotype %f is greater than the heterozygosity %f", pTriStateGenotype, heterozyosity));
|
||||
// }
|
||||
|
||||
double pHomRef = heterozygosity2HomRefProbability(heterozyosity);
|
||||
double pHet = heterozygosity2HetProbability(heterozyosity);
|
||||
double pHomVar = heterozygosity2HomVarProbability(heterozyosity);
|
||||
|
||||
if (MathUtils.compareDoubles(pHomRef + pHet + pHomVar, 1.0) != 0) {
|
||||
throw new RuntimeException(String.format("BUG: Prior probabilities don't sum to one => %f, %f, %f", pHomRef, pHet, pHomVar));
|
||||
}
|
||||
|
||||
double[] priors = new double[DiploidGenotype.values().length];
|
||||
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
double POfG = 0.0;
|
||||
|
||||
final double nOnRefHets = 3;
|
||||
final double nOffRefHets = 3;
|
||||
final double nHomVars = 3;
|
||||
|
||||
if ( g.isHomRef(ref) ) { POfG = pHomRef; }
|
||||
else if ( g.isHomVar(ref) ) { POfG = pHomVar / nHomVars; }
|
||||
else if ( g.isHetRef(ref) ) { POfG = (pHet - pTriStateGenotype ) / nOnRefHets; }
|
||||
else { POfG = pTriStateGenotype / nOffRefHets; }
|
||||
|
||||
priors[g.ordinal()] = Math.log10(POfG);
|
||||
}
|
||||
|
||||
return priors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector
|
||||
* appropriately.
|
||||
*
|
||||
* TODO -- delete me
|
||||
*
|
||||
* @param ref
|
||||
* @param priorHomRef
|
||||
* @param priorHet
|
||||
* @param priorHomVar
|
||||
*/
|
||||
@Deprecated
|
||||
public static double[] getReferenceIndependentPriors(byte ref, double priorHomRef, double priorHet, double priorHomVar ) {
|
||||
if ((priorHomRef + priorHet + priorHomVar) != 1) {
|
||||
throw new RuntimeException(String.format("Prior probabilities don't sum to one => %f, %f, %f", priorHomRef, priorHet, priorHomVar));
|
||||
}
|
||||
|
||||
double[] priors = new double[DiploidGenotype.values().length];
|
||||
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
int nRefBases = MathUtils.countOccurrences((char)ref, g.toString()); // todo -- fixme
|
||||
double log10POfG = 0.0;
|
||||
|
||||
switch ( nRefBases ) {
|
||||
case 2: // hom-ref
|
||||
log10POfG = Math.log10(priorHomRef);
|
||||
break;
|
||||
case 0: // hom-nonref
|
||||
//log10POfG = Math.log10(priorHomVar / 3);
|
||||
log10POfG = Math.log10(priorHomVar);
|
||||
break;
|
||||
default:
|
||||
//log10POfG = Math.log10(priorHet / 6);
|
||||
log10POfG = Math.log10(priorHet);
|
||||
break;
|
||||
}
|
||||
|
||||
priors[g.ordinal()] = log10POfG;
|
||||
}
|
||||
|
||||
return priors;
|
||||
}
|
||||
|
||||
static {
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
flatPriors[g.ordinal()] = Math.log10(1.0 / DiploidGenotype.values().length);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
|
@ -23,7 +23,7 @@
|
|||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
|
@ -23,7 +23,7 @@
|
|||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
|
|
@ -1,3 +1,28 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
|
@ -9,7 +34,7 @@ import java.util.EnumMap;
|
|||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
|
||||
public class EmpiricalSubstitutionProbabilities extends FourBaseProbabilities {
|
||||
public class EmpiricalSubstitutionProbabilities extends FourBaseLikelihoods {
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Static methods to manipulate machine platforms
|
||||
|
|
@ -38,8 +63,8 @@ public class EmpiricalSubstitutionProbabilities extends FourBaseProbabilities {
|
|||
bind("454", SequencerPlatform.ROCHE454);
|
||||
bind("ILLUMINA", SequencerPlatform.SOLEXA);
|
||||
bind("SOLEXA", SequencerPlatform.SOLEXA);
|
||||
bind("solid", SequencerPlatform.SOLID);
|
||||
bind("abi_solid", SequencerPlatform.SOLID);
|
||||
bind("SOLID", SequencerPlatform.SOLID);
|
||||
bind("ABI_SOLID", SequencerPlatform.SOLID);
|
||||
bind("CG", SequencerPlatform.CG);
|
||||
}
|
||||
|
||||
|
|
@ -253,7 +278,7 @@ public class EmpiricalSubstitutionProbabilities extends FourBaseProbabilities {
|
|||
|
||||
if ( pl == SequencerPlatform.UNKNOWN ) {
|
||||
if ( raiseErrorOnUnknownPlatform )
|
||||
throw new RuntimeException("Unknown Sequencer platform for read " + read.format() + "; your BAM file is missing the PL tag for some read groups. Please specify a default platform. See your walker's documentation to accomplish this");
|
||||
throw new RuntimeException("Unknown sequencer platform for read " + read.format() + "; your BAM file is either missing the PL tag for some read groups or an unsupported platform is being used.");
|
||||
else {
|
||||
pl = defaultPlatform;
|
||||
//System.out.printf("Using default platform %s", pl);
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@
|
|||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
|
|
@ -23,7 +23,7 @@
|
|||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
|
@ -23,9 +23,9 @@
|
|||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import static org.broadinstitute.sting.playground.gatk.walkers.genotyper.BaseMismatchModel.*;
|
||||
import static org.broadinstitute.sting.gatk.walkers.genotyper.BaseMismatchModel.*;
|
||||
|
||||
public class FourBaseLikelihoodsFactory {
|
||||
//private FourBaseProbabilitiesFactory() {} // cannot be instantiated
|
||||
|
|
@ -35,9 +35,7 @@ public class FourBaseLikelihoodsFactory {
|
|||
}
|
||||
|
||||
public static BaseMismatchModel getBaseMismatchModel(final FourBaseLikelihoods m) {
|
||||
if ( m instanceof OneStateErrorProbabilities)
|
||||
return ONE_STATE;
|
||||
else if ( m instanceof ThreeStateErrorProbabilities)
|
||||
if ( m instanceof ThreeStateErrorProbabilities)
|
||||
return THREE_STATE;
|
||||
else if ( m instanceof EmpiricalSubstitutionProbabilities)
|
||||
return EMPIRICAL;
|
||||
|
|
@ -55,7 +53,6 @@ public class FourBaseLikelihoodsFactory {
|
|||
public static FourBaseLikelihoods makeFourBaseLikelihoods(BaseMismatchModel m,
|
||||
EmpiricalSubstitutionProbabilities.SequencerPlatform pl ) {
|
||||
switch ( m ) {
|
||||
case ONE_STATE: return new OneStateErrorProbabilities();
|
||||
case THREE_STATE: return new ThreeStateErrorProbabilities();
|
||||
case EMPIRICAL: return new EmpiricalSubstitutionProbabilities(pl);
|
||||
default: throw new RuntimeException("Unexpected BaseMismatchModel " + m);
|
||||
|
|
@ -1,348 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
|
||||
/**
|
||||
* Stable, error checking version of strict 4 base likelihoods. Useful for calculating the likelihoods, priors,
|
||||
* and posteriors given a pile of bases and quality scores (in conjuncion with GenotypeLikelihoods)
|
||||
*
|
||||
* Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object
|
||||
* calculates:
|
||||
*
|
||||
* P(b | D) = P(b) * P(D | b)
|
||||
*
|
||||
* where
|
||||
*
|
||||
* P(D | b) = sum_i log10 P(bi | b)
|
||||
*
|
||||
* and
|
||||
*
|
||||
* P(bi | b) = 1 - P(error | q1) if bi = b
|
||||
* = P(error | q1) / 3 if bi != b
|
||||
*
|
||||
*
|
||||
*/
|
||||
public abstract class FourBaseProbabilities implements Cloneable {
|
||||
|
||||
protected boolean enableCacheFlag = true;
|
||||
|
||||
//
|
||||
// The fundamental data array associated with 4-base likelihoods
|
||||
//
|
||||
protected double[] log10Likelihoods = null;
|
||||
|
||||
|
||||
/**
|
||||
* If true, lots of output will be generated about the Likelihoods at each site
|
||||
*/
|
||||
private boolean verbose = false;
|
||||
|
||||
/**
|
||||
* Bases with Q scores below this threshold aren't included in the Likelihood calculation
|
||||
*/
|
||||
private int minQScoreToInclude = 0;
|
||||
|
||||
/**
|
||||
* Create a new FourBaseLikelihoods object
|
||||
*/
|
||||
public FourBaseProbabilities() {
|
||||
log10Likelihoods = zeros.clone(); // Likelihoods are all zeros
|
||||
}
|
||||
|
||||
/**
|
||||
* Cloning of the object
|
||||
* @return clone
|
||||
* @throws CloneNotSupportedException
|
||||
*/
|
||||
protected Object clone() throws CloneNotSupportedException {
|
||||
FourBaseProbabilities c = (FourBaseProbabilities)super.clone();
|
||||
c.log10Likelihoods = log10Likelihoods.clone();
|
||||
return c;
|
||||
}
|
||||
|
||||
public void setVerbose(boolean v) {
|
||||
verbose = v;
|
||||
}
|
||||
|
||||
public boolean isVerbose() {
|
||||
return verbose;
|
||||
}
|
||||
|
||||
public int getMinQScoreToInclude() {
|
||||
return minQScoreToInclude;
|
||||
}
|
||||
|
||||
public void setMinQScoreToInclude(int minQScoreToInclude) {
|
||||
this.minQScoreToInclude = minQScoreToInclude;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of log10 likelihoods for each base, indexed by BaseUtils.BASES.ordinal values()
|
||||
* @return probs
|
||||
*/
|
||||
public double[] getLog10Likelihoods() {
|
||||
return log10Likelihoods;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the likelihood associated with a base
|
||||
* @param base base
|
||||
* @return log10 likelihood as a double
|
||||
*/
|
||||
public double getLog10Likelihood(byte base) {
|
||||
return getLog10Likelihood(BaseUtils.simpleBaseToBaseIndex(base));
|
||||
}
|
||||
|
||||
public double getLog10Likelihood(int baseIndex) {
|
||||
return (baseIndex < 0 ? 0.0 : getLog10Likelihoods()[baseIndex]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of likelihoods for each base, indexed by BaseUtils.BASES.ordinal values()
|
||||
* @return probs
|
||||
*/
|
||||
public double[] getLikelihoods() {
|
||||
double[] probs = new double[4];
|
||||
for (int i = 0; i < 4; i++)
|
||||
probs[i] = Math.pow(10, log10Likelihoods[i]);
|
||||
return probs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the likelihoods associated with a base
|
||||
* @param base base
|
||||
* @return likelihoods as a double
|
||||
*/
|
||||
public double getLikelihood(byte base) {
|
||||
return getLikelihood(BaseUtils.simpleBaseToBaseIndex(base));
|
||||
}
|
||||
|
||||
public double getLikelihood(int baseIndex) {
|
||||
return (baseIndex < 0 ? 0.0 : Math.pow(10, log10Likelihoods[baseIndex]));
|
||||
}
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// add() -- the heart of
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Updates likelihoods and posteriors to reflect an additional observation of observedBase with
|
||||
* qualityScore.
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @param qualityScore base quality
|
||||
* @param read SAM read
|
||||
* @param offset offset on read
|
||||
* @return 1 if the base was considered good enough to add to the likelihoods (not Q0 or 'N', for example)
|
||||
*/
|
||||
public int add(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
|
||||
FourBaseProbabilities fbl = computeLog10Likelihoods(observedBase, qualityScore, read, offset);
|
||||
if ( fbl == null )
|
||||
return 0;
|
||||
|
||||
for ( byte base : BaseUtils.BASES ) {
|
||||
double likelihood = fbl.getLikelihood(base);
|
||||
log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)] += likelihood;
|
||||
}
|
||||
|
||||
if ( verbose ) {
|
||||
for ( byte base : BaseUtils.BASES ) { System.out.printf("%s\t", (char)base); }
|
||||
System.out.println();
|
||||
for ( byte base : BaseUtils.BASES ) { System.out.printf("%.2f\t", log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)]); }
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates likelihoods and posteriors to reflect an additional observation of observedBase with
|
||||
* qualityScore.
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @param qualityScore base quality
|
||||
* @param read SAM read
|
||||
* @param offset offset on read
|
||||
* @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example)
|
||||
*/
|
||||
public FourBaseProbabilities computeLog10Likelihoods(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
|
||||
if ( badBase(observedBase) ) {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
if ( qualityScore > getMinQScoreToInclude() ) {
|
||||
|
||||
FourBaseProbabilities fbl = (FourBaseProbabilities)this.clone();
|
||||
fbl.log10Likelihoods = zeros.clone();
|
||||
|
||||
for ( byte base : BaseUtils.BASES ) {
|
||||
double likelihood = log10PofObservingBaseGivenChromosome(observedBase, base, qualityScore, read, offset);
|
||||
|
||||
if ( verbose ) {
|
||||
boolean fwdStrand = ! read.getReadNegativeStrandFlag();
|
||||
System.out.printf(" L(%c | b=%s, Q=%d, S=%s) = %f / %f%n",
|
||||
observedBase, base, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood);
|
||||
}
|
||||
|
||||
fbl.log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)] = likelihood;
|
||||
}
|
||||
|
||||
return fbl;
|
||||
}
|
||||
} catch ( CloneNotSupportedException e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// helper routines
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns true when the observedBase is considered bad and shouldn't be processed by this object. A base
|
||||
* is considered bad if:
|
||||
*
|
||||
* Criterion 1: observed base isn't a A,C,T,G or lower case equivalent
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @return true if the base is a bad base
|
||||
*/
|
||||
private boolean badBase(byte observedBase) {
|
||||
return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a string representation of this object in a moderately usable form
|
||||
*
|
||||
* @return string representation
|
||||
*/
|
||||
public String toString() {
|
||||
double sum = 0;
|
||||
StringBuilder s = new StringBuilder();
|
||||
for ( byte base : BaseUtils.BASES ) {
|
||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(base);
|
||||
s.append(String.format("%s %.10f ", base, log10Likelihoods[baseIndex]));
|
||||
sum += Math.pow(10, log10Likelihoods[baseIndex]);
|
||||
}
|
||||
s.append(String.format(" %f", sum));
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
// in general, we don't care about the platform index; EmpiricalSubstitutionProbabilities overlaods this
|
||||
public int getReadSequencerPlatformIndex( SAMRecord read ) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// Validation routines
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public boolean validate() {
|
||||
return validate(true);
|
||||
}
|
||||
|
||||
public boolean validate(boolean throwException) {
|
||||
try {
|
||||
|
||||
for ( byte base : BaseUtils.BASES ) {
|
||||
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(base);
|
||||
if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) {
|
||||
String bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]);
|
||||
throw new IllegalStateException(String.format("At %s: %s", base, bad));
|
||||
}
|
||||
}
|
||||
} catch ( IllegalStateException e ) {
|
||||
if ( throwException )
|
||||
throw new RuntimeException(e);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// Hearty math calculations follow
|
||||
//
|
||||
// -- these should not be messed with unless you know what you are doing
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @param chromBase target base
|
||||
* @param qual base quality
|
||||
* @param read SAM read
|
||||
* @param offset offset on read
|
||||
* @return log10 likelihood
|
||||
*/
|
||||
|
||||
protected double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual, SAMRecord read, int offset) {
|
||||
if (qual == 0) { // zero quals are wrong
|
||||
throw new RuntimeException(String.format("Unexpected Q0 base discovered in log10PofObservingBaseGivenChromosome: %c %s %d at %d in %s",
|
||||
observedBase, chromBase, qual, offset, read));
|
||||
}
|
||||
|
||||
double logP;
|
||||
|
||||
if ( observedBase == chromBase ) {
|
||||
// the base is consistent with the chromosome -- it's 1 - e
|
||||
//logP = oneMinusData[qual];
|
||||
double e = pow(10, (qual / -10.0));
|
||||
logP = log10(1.0 - e);
|
||||
} else {
|
||||
// the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error)
|
||||
logP = qual / -10.0 + log10PofTrueBaseGivenMiscall(observedBase, chromBase, read, offset);
|
||||
}
|
||||
|
||||
//System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP);
|
||||
return logP;
|
||||
}
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @param chromBase target base
|
||||
* @param read SAM read
|
||||
* @param offset offset on read
|
||||
* @return log10 likelihood
|
||||
*/
|
||||
protected abstract double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset);
|
||||
|
||||
//
|
||||
// Constant static data
|
||||
//
|
||||
private final static double[] zeros = new double[BaseUtils.BASES.length];
|
||||
|
||||
static {
|
||||
for ( byte base : BaseUtils.BASES ) {
|
||||
zeros[BaseUtils.simpleBaseToBaseIndex(base)] = 0.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,39 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import static org.broadinstitute.sting.gatk.walkers.genotyper.BaseMismatchModel.*;
|
||||
|
||||
public class FourBaseProbabilitiesFactory {
|
||||
//private FourBaseProbabilitiesFactory() {} // cannot be instantiated
|
||||
|
||||
public static BaseMismatchModel getBaseMismatchModel(final String name) {
|
||||
return valueOf(name);
|
||||
}
|
||||
|
||||
public static BaseMismatchModel getBaseMismatchModel(final FourBaseProbabilities m) {
|
||||
if ( m instanceof OneStateErrorProbabilities)
|
||||
return ONE_STATE;
|
||||
else if ( m instanceof ThreeStateErrorProbabilities)
|
||||
return THREE_STATE;
|
||||
else if ( m instanceof EmpiricalSubstitutionProbabilities)
|
||||
return EMPIRICAL;
|
||||
|
||||
throw new RuntimeException("Unexpected BaseMismatchModel " + m.getClass());
|
||||
|
||||
}
|
||||
/**
|
||||
* General and correct way to create FourBaseLikelihood objects for arbitrary base mismatching models
|
||||
*
|
||||
* @param m model
|
||||
* @param pl default platform
|
||||
* @return new 4-base model
|
||||
*/
|
||||
public static FourBaseProbabilities makeFourBaseLikelihoods(BaseMismatchModel m,
|
||||
EmpiricalSubstitutionProbabilities.SequencerPlatform pl ) {
|
||||
switch ( m ) {
|
||||
case ONE_STATE: return new OneStateErrorProbabilities();
|
||||
case THREE_STATE: return new ThreeStateErrorProbabilities();
|
||||
case EMPIRICAL: return new EmpiricalSubstitutionProbabilities(pl);
|
||||
default: throw new RuntimeException("Unexpected BaseMismatchModel " + m);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,93 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
|
||||
/**
|
||||
* The model representing how we calculate a genotype given the priors and a pile
|
||||
* of bases and quality scores
|
||||
*/
|
||||
public abstract class GenotypeCalculationModel implements Cloneable {
|
||||
|
||||
public enum Model {
|
||||
JOINT_ESTIMATE,
|
||||
DINDEL
|
||||
}
|
||||
|
||||
protected UnifiedArgumentCollection UAC;
|
||||
protected Set<String> samples;
|
||||
protected Logger logger;
|
||||
protected PrintStream verboseWriter;
|
||||
|
||||
/**
|
||||
* Create a new GenotypeCalculationModel object
|
||||
*/
|
||||
protected GenotypeCalculationModel() {}
|
||||
|
||||
|
||||
/**
|
||||
* Initialize the GenotypeCalculationModel object
|
||||
* Assumes that out is not null
|
||||
* @param samples samples in input bam
|
||||
* @param logger logger
|
||||
* @param UAC unified arg collection
|
||||
* @param verboseWriter verbose writer
|
||||
*/
|
||||
protected void initialize(Set<String> samples,
|
||||
Logger logger,
|
||||
UnifiedArgumentCollection UAC,
|
||||
PrintStream verboseWriter) {
|
||||
this.UAC = UAC.clone();
|
||||
this.samples = new TreeSet<String>(samples);
|
||||
this.logger = logger;
|
||||
this.verboseWriter = verboseWriter;
|
||||
}
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
* @param tracker rod data
|
||||
* @param ref reference base
|
||||
* @param loc GenomeLoc
|
||||
* @param stratifiedContexts stratified alignment contexts
|
||||
* @param priors priors to use for GL
|
||||
*
|
||||
* @return call
|
||||
*/
|
||||
public abstract VariantCallContext callLocus(RefMetaDataTracker tracker,
|
||||
byte ref,
|
||||
GenomeLoc loc,
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts,
|
||||
DiploidGenotypePriors priors);
|
||||
|
||||
/**
|
||||
* @param tracker rod data
|
||||
* @param ref reference base
|
||||
* @param loc GenomeLoc
|
||||
* @param stratifiedContexts stratified alignment contexts
|
||||
*
|
||||
* @return call
|
||||
*/
|
||||
public abstract VariantCallContext callExtendedLocus(RefMetaDataTracker tracker,
|
||||
byte[] ref,
|
||||
GenomeLoc loc,
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts);
|
||||
|
||||
/**
|
||||
* @param tracker rod data
|
||||
*
|
||||
* @return the dbsnp rod if there is one at this position
|
||||
*/
|
||||
public static DbSNPFeature getDbSNP(RefMetaDataTracker tracker) {
|
||||
return DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
|
||||
}
|
||||
}
|
||||
|
|
@ -1,70 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import static org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model.*;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.Set;
|
||||
import java.io.PrintStream;
|
||||
|
||||
|
||||
public class GenotypeCalculationModelFactory {
|
||||
|
||||
public static GenotypeCalculationModel.Model getGenotypeCalculationModel(final String name) {
|
||||
return valueOf(name);
|
||||
}
|
||||
|
||||
/**
|
||||
* General and correct way to create GenotypeCalculationModel objects for arbitrary models
|
||||
*
|
||||
* @param samples samples in bam
|
||||
* @param logger logger
|
||||
* @param UAC the unified argument collection
|
||||
* @param verboseWriter verbose writer
|
||||
*
|
||||
* @return model
|
||||
*/
|
||||
public static GenotypeCalculationModel makeGenotypeCalculation(Set<String> samples,
|
||||
Logger logger,
|
||||
UnifiedArgumentCollection UAC,
|
||||
PrintStream verboseWriter) {
|
||||
GenotypeCalculationModel gcm;
|
||||
switch ( UAC.genotypeModel ) {
|
||||
case JOINT_ESTIMATE:
|
||||
gcm = new DiploidGenotypeCalculationModel(samples.size(), UAC.heterozygosity);
|
||||
break;
|
||||
case DINDEL:
|
||||
throw new UnsupportedOperationException("The Dindel-based genotype likelihoods model is not currently supported");
|
||||
//gcm = new SimpleIndelCalculationModel();
|
||||
//break;
|
||||
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel);
|
||||
}
|
||||
|
||||
gcm.initialize(samples, logger, UAC, verboseWriter);
|
||||
return gcm;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,509 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
|
||||
/**
|
||||
* Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors,
|
||||
* and posteriors given a pile of bases and quality scores
|
||||
*
|
||||
* Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object
|
||||
* calculates:
|
||||
*
|
||||
* P(G | D) = P(G) * P(D | G)
|
||||
*
|
||||
* where
|
||||
*
|
||||
* P(D | G) = sum_i log10 P(bi | G)
|
||||
*
|
||||
* and
|
||||
*
|
||||
* P(bi | G) = 1 - P(error | q1) if bi is in G
|
||||
* = P(error | q1) / 3 if bi is not in G
|
||||
*
|
||||
* for homozygous genotypes and for heterozygous genotypes:
|
||||
*
|
||||
* P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G
|
||||
* = P(error | q1) / 3 if bi is not in G
|
||||
*
|
||||
* for each of the 10 unique diploid genotypes AA, AC, AG, .., TT
|
||||
*
|
||||
* Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space.
|
||||
*
|
||||
* The priors contain the relative probabilities of each genotype, and must be provided at object creation.
|
||||
* From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above
|
||||
* model.
|
||||
*/
|
||||
public class GenotypeLikelihoods implements Cloneable {
|
||||
protected final static int FIXED_PLOIDY = 2;
|
||||
protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1;
|
||||
|
||||
protected boolean enableCacheFlag = true;
|
||||
protected boolean VERBOSE = false;
|
||||
|
||||
//
|
||||
// The fundamental data arrays associated with a Genotype Likelhoods object
|
||||
//
|
||||
protected double[] log10Likelihoods = null;
|
||||
protected double[] log10Posteriors = null;
|
||||
|
||||
protected DiploidGenotypePriors priors = null;
|
||||
|
||||
protected FourBaseProbabilities fourBaseLikelihoods = null;
|
||||
|
||||
/**
|
||||
* Create a new GenotypeLikelhoods object with flat priors for each diploid genotype
|
||||
*
|
||||
* @param m base model
|
||||
*/
|
||||
public GenotypeLikelihoods(BaseMismatchModel m) {
|
||||
this.priors = new DiploidGenotypePriors();
|
||||
initialize(m, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new GenotypeLikelhoods object with flat priors for each diploid genotype
|
||||
*
|
||||
* @param m base model
|
||||
* @param pl default platform
|
||||
*/
|
||||
public GenotypeLikelihoods(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
|
||||
this.priors = new DiploidGenotypePriors();
|
||||
initialize(m, pl);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new GenotypeLikelhoods object with given priors for each diploid genotype
|
||||
*
|
||||
* @param m base model
|
||||
* @param priors priors
|
||||
*/
|
||||
public GenotypeLikelihoods(BaseMismatchModel m, DiploidGenotypePriors priors) {
|
||||
this.priors = priors;
|
||||
initialize(m, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new GenotypeLikelhoods object with given priors for each diploid genotype
|
||||
*
|
||||
* @param m base model
|
||||
* @param priors priors
|
||||
* @param pl default platform
|
||||
*/
|
||||
public GenotypeLikelihoods(BaseMismatchModel m, DiploidGenotypePriors priors, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
|
||||
this.priors = priors;
|
||||
initialize(m, pl);
|
||||
}
|
||||
|
||||
/**
|
||||
* Cloning of the object
|
||||
* @return clone
|
||||
* @throws CloneNotSupportedException
|
||||
*/
|
||||
protected Object clone() throws CloneNotSupportedException {
|
||||
GenotypeLikelihoods c = (GenotypeLikelihoods)super.clone();
|
||||
c.priors = priors;
|
||||
c.log10Likelihoods = log10Likelihoods.clone();
|
||||
c.log10Posteriors = log10Posteriors.clone();
|
||||
c.fourBaseLikelihoods = (FourBaseProbabilities)fourBaseLikelihoods.clone();
|
||||
return c;
|
||||
}
|
||||
|
||||
protected void initialize(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
|
||||
fourBaseLikelihoods = FourBaseProbabilitiesFactory.makeFourBaseLikelihoods(m, pl);
|
||||
setToZero();
|
||||
}
|
||||
|
||||
protected void setToZero() {
|
||||
log10Likelihoods = zeros.clone(); // likelihoods are all zeros
|
||||
log10Posteriors = priors.getPriors().clone(); // posteriors are all the priors
|
||||
}
|
||||
|
||||
public void setVerbose(boolean v) {
|
||||
VERBOSE = v;
|
||||
fourBaseLikelihoods.setVerbose(v);
|
||||
}
|
||||
|
||||
public boolean isVerbose() {
|
||||
return VERBOSE;
|
||||
}
|
||||
|
||||
public int getMinQScoreToInclude() {
|
||||
return fourBaseLikelihoods.getMinQScoreToInclude();
|
||||
}
|
||||
|
||||
public void setMinQScoreToInclude(int minQScoreToInclude) {
|
||||
fourBaseLikelihoods.setMinQScoreToInclude(minQScoreToInclude);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values()
|
||||
* @return likelihoods array
|
||||
*/
|
||||
public double[] getLikelihoods() {
|
||||
return log10Likelihoods;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the likelihood associated with DiploidGenotype g
|
||||
* @param g genotype
|
||||
* @return log10 likelihood as a double
|
||||
*/
|
||||
public double getLikelihood(DiploidGenotype g) {
|
||||
return getLikelihoods()[g.ordinal()];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||
*
|
||||
* @return raw log10 (not-normalized posteriors) as a double array
|
||||
*/
|
||||
public double[] getPosteriors() {
|
||||
return log10Posteriors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the posterior associated with DiploidGenotype g
|
||||
* @param g genotpe
|
||||
* @return raw log10 (not-normalized posteror) as a double
|
||||
*/
|
||||
public double getPosterior(DiploidGenotype g) {
|
||||
return getPosteriors()[g.ordinal()];
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||
*
|
||||
* @return normalized posterors as a double array
|
||||
*/
|
||||
public double[] getNormalizedPosteriors() {
|
||||
double[] normalized = new double[log10Posteriors.length];
|
||||
double sum = 0.0;
|
||||
|
||||
// for precision purposes, we need to add (or really subtract, since everything is negative)
|
||||
// the largest posterior value from all entries so that numbers don't get too small
|
||||
double maxValue = log10Posteriors[0];
|
||||
for (int i = 1; i < log10Posteriors.length; i++) {
|
||||
if ( maxValue < log10Posteriors[i] )
|
||||
maxValue = log10Posteriors[i];
|
||||
}
|
||||
|
||||
// collect the posteriors
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
double posterior = Math.pow(10, getPosterior(g) - maxValue);
|
||||
normalized[g.ordinal()] = posterior;
|
||||
sum += posterior;
|
||||
}
|
||||
|
||||
// normalize
|
||||
for (int i = 0; i < normalized.length; i++)
|
||||
normalized[i] /= sum;
|
||||
|
||||
return normalized;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
 * @return the DiploidGenotypePriors object backing this calculation
 */
public DiploidGenotypePriors getPriorObject() {
    return priors;
}

/**
 * Returns an array of priors for each genotype, indexed by
 * DiploidGenotype.ordinal() values().
 *
 * @return log10 prior as a double array
 */
public double[] getPriors() {
    return priors.getPriors();
}
|
||||
|
||||
/**
|
||||
* Sets the priors
|
||||
* @param priors priors
|
||||
*/
|
||||
public void setPriors(DiploidGenotypePriors priors) {
|
||||
this.priors = priors;
|
||||
log10Posteriors = zeros.clone();
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
int i = g.ordinal();
|
||||
log10Posteriors[i] = priors.getPriors()[i] + log10Likelihoods[i];
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the prior associated with DiploidGenotype g.
 *
 * @param g genotype
 * @return log10 prior as a double
 */
public double getPrior(DiploidGenotype g) {
    return getPriors()[g.ordinal()];
}
|
||||
|
||||
/**
 * Simple function to overload to control the caching of genotype likelihood outputs.
 * By default the function returns true -- do enable caching. If you are experimenting with a
 * complex calculation of P(B | G) and caching might not work correctly for you, overload this
 * function and return false, so the super() object won't try to cache your GL calculations.
 *
 * @return true if caching should be enabled, false otherwise
 */
public boolean cacheIsEnabled() {
    return enableCacheFlag;
}

/**
 * Toggles the per-observation likelihood cache on or off.
 *
 * @param enable true to enable caching
 */
public void setEnableCacheFlag(boolean enable) {
    enableCacheFlag = enable;
}
|
||||
|
||||
/**
 * Convenience overload: adds every base in the pileup without ignoring bad
 * bases and without capping base qualities at the read's mapping quality.
 *
 * @param pileup read pileup
 * @return the number of good bases found in the pileup
 */
public int add(ReadBackedPileup pileup) {
    return add(pileup, false, false);
}
|
||||
|
||||
/**
|
||||
* Updates likelihoods and posteriors to reflect the additional observations contained within the
|
||||
* read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
|
||||
* pileup
|
||||
*
|
||||
* @param pileup read pileup
|
||||
* @param ignoreBadBases should we ignore bad bases?
|
||||
* @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality?
|
||||
* @return the number of good bases found in the pileup
|
||||
*/
|
||||
public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
|
||||
int n = 0;
|
||||
|
||||
for ( PileupElement p : pileup ) {
|
||||
// ignore deletions
|
||||
if ( p.isDeletion() )
|
||||
continue;
|
||||
|
||||
byte base = p.getBase();
|
||||
if ( ! ignoreBadBases || ! badBase(base) ) {
|
||||
byte qual = capBaseQualsAtMappingQual ? (byte)Math.min((int)p.getQual(), p.getMappingQual()) : p.getQual();
|
||||
n += add(base, qual, p.getRead(), p.getOffset());
|
||||
}
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
/**
 * Incorporates a single base observation into the running likelihoods and
 * posteriors, consulting the shared cache when caching is enabled.
 *
 * @param observedBase the base observed in the read
 * @param qualityScore the base's quality score
 * @param read         the read the base came from
 * @param offset       the base's offset within the read
 * @return 1 if the observation was incorporated, 0 if the base was bad
 *         (no likelihoods could be computed for it)
 */
public int add(byte observedBase, byte qualityScore, SAMRecord read, int offset) {

    // Handle caching if requested. Just look up the cached result if its available, or compute and store it
    GenotypeLikelihoods gl;
    if ( cacheIsEnabled() ) {
        if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read) ) {
            gl = calculateCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset);
        } else {
            gl = getCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read);
        }
    } else {
        gl = calculateGenotypeLikelihoods(observedBase, qualityScore, read, offset);
    }

    // for bad bases, there are no likelihoods
    if ( gl == null )
        return 0;

    double[] likelihoods = gl.getLikelihoods();

    // accumulate the per-observation likelihood into both running arrays;
    // adding in log10 space multiplies the underlying probabilities
    for ( DiploidGenotype g : DiploidGenotype.values() ) {
        double likelihood = likelihoods[g.ordinal()];

        if ( VERBOSE ) {
            boolean fwdStrand = ! read.getReadNegativeStrandFlag();
            System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n",
                    observedBase, g, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood);
        }

        log10Likelihoods[g.ordinal()] += likelihood;
        log10Posteriors[g.ordinal()] += likelihood;
    }

    return 1;
}
|
||||
|
||||
// Shared six-dimensional likelihood cache, indexed by
// [mismatch model][sequencer platform][base][quality][ploidy][strand].
static GenotypeLikelihoods[][][][][][] CACHE = new GenotypeLikelihoods[BaseMismatchModel.values().length][EmpiricalSubstitutionProbabilities.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY][2];
|
||||
|
||||
/**
 * @return true if a likelihood result for this observation has already
 *         been computed and stored in the shared cache
 */
protected boolean inCache( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
    return getCache(CACHE, observedBase, qualityScore, ploidy, read) != null;
}
|
||||
|
||||
protected GenotypeLikelihoods getCachedGenotypeLikelihoods( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
|
||||
GenotypeLikelihoods gl = getCache(CACHE, observedBase, qualityScore, ploidy, read);
|
||||
if ( gl == null )
|
||||
throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s",
|
||||
observedBase, qualityScore, ploidy, read));
|
||||
return gl;
|
||||
}
|
||||
|
||||
/**
 * Computes the likelihoods for an observation and stores the result in the
 * shared cache before returning it.
 */
protected GenotypeLikelihoods calculateCachedGenotypeLikelihoods(byte observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset) {
    GenotypeLikelihoods gl = calculateGenotypeLikelihoods(observedBase, qualityScore, read, offset);
    setCache(CACHE, observedBase, qualityScore, ploidy, read, gl);
    return gl;
}
|
||||
|
||||
protected void setCache( GenotypeLikelihoods[][][][][][] cache,
|
||||
byte observedBase, byte qualityScore, int ploidy,
|
||||
SAMRecord read, GenotypeLikelihoods val ) {
|
||||
int m = FourBaseProbabilitiesFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal();
|
||||
int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read);
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
|
||||
int j = qualityScore;
|
||||
int k = ploidy;
|
||||
int x = strandIndex(! read.getReadNegativeStrandFlag() );
|
||||
|
||||
cache[m][a][i][j][k][x] = val;
|
||||
}
|
||||
|
||||
protected GenotypeLikelihoods getCache( GenotypeLikelihoods[][][][][][] cache,
|
||||
byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
|
||||
int m = FourBaseProbabilitiesFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal();
|
||||
int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read);
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
|
||||
int j = qualityScore;
|
||||
int k = ploidy;
|
||||
int x = strandIndex(! read.getReadNegativeStrandFlag() );
|
||||
return cache[m][a][i][j][k][x];
|
||||
}
|
||||
|
||||
/**
 * Computes per-genotype log10 likelihoods for one base observation by
 * combining the four-base model's per-allele likelihoods over the two
 * chromosomes of a diploid genotype. Works on a clone of this object so
 * the caller's accumulated state is untouched.
 *
 * @return a fresh GenotypeLikelihoods holding this observation's
 *         likelihoods, or null if the four-base model rejects the base
 */
protected GenotypeLikelihoods calculateGenotypeLikelihoods(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
    FourBaseProbabilities fbl = fourBaseLikelihoods.computeLog10Likelihoods(observedBase, qualityScore, read, offset);
    if ( fbl == null )
        return null;

    double[] fbLikelihoods = fbl.getLog10Likelihoods();
    try {

        // clone so the returned object starts from clean state (zero
        // likelihoods, prior-only posteriors) without disturbing this one
        GenotypeLikelihoods gl = (GenotypeLikelihoods)this.clone();
        gl.setToZero();

        // we need to adjust for ploidy. We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space
        double ploidyAdjustment = log10(FIXED_PLOIDY);

        for ( DiploidGenotype g : DiploidGenotype.values() ) {

            // todo assumes ploidy is 2 -- should be generalized. Obviously the below code can be turned into a loop
            double p_base = 0.0;
            p_base += pow(10, fbLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base1)] - ploidyAdjustment);
            p_base += pow(10, fbLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base2)] - ploidyAdjustment);
            double likelihood = log10(p_base);

            gl.log10Likelihoods[g.ordinal()] += likelihood;
            gl.log10Posteriors[g.ordinal()] += likelihood;
        }

        if ( VERBOSE ) {
            for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); }
            System.out.println();
            for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", gl.log10Likelihoods[g.ordinal()]); }
            System.out.println();
        }

        return gl;

    } catch ( CloneNotSupportedException e ) {
        throw new RuntimeException(e);
    }
}
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// helper routines
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public static int strandIndex(boolean fwdStrand) {
|
||||
return fwdStrand ? 0 : 1;
|
||||
}
|
||||
|
||||
/**
 * Returns true when the observedBase is considered bad and shouldn't be processed by this object. A base
 * is considered bad if:
 *
 * Criterion 1: observed base isn't a A,C,T,G or lower case equivalent
 *
 * @param observedBase observed base
 * @return true if the base is a bad base
 */
protected boolean badBase(byte observedBase) {
    // simpleBaseToBaseIndex returns -1 for anything that isn't ACGT/acgt
    return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1;
}
|
||||
|
||||
/**
|
||||
* Return a string representation of this object in a moderately usable form
|
||||
*
|
||||
* @return string representation
|
||||
*/
|
||||
public String toString() {
|
||||
double sum = 0;
|
||||
StringBuilder s = new StringBuilder();
|
||||
for (DiploidGenotype g : DiploidGenotype.values()) {
|
||||
s.append(String.format("%s %.10f ", g, log10Likelihoods[g.ordinal()]));
|
||||
sum += Math.pow(10,log10Likelihoods[g.ordinal()]);
|
||||
}
|
||||
s.append(String.format(" %f", sum));
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// Validation routines
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
 * Validates internal state, throwing a RuntimeException on any problem.
 *
 * @return true if the state is well-formed
 */
public boolean validate() {
    return validate(true);
}
|
||||
|
||||
/**
 * Validates the priors plus every per-genotype likelihood and posterior:
 * each must be a well-formed double and at most zero (valid log10
 * probabilities).
 *
 * @param throwException if true, wrap and rethrow any validation failure
 *                       as a RuntimeException; if false, just return false
 * @return true if everything is well-formed
 */
public boolean validate(boolean throwException) {
    try {
        priors.validate(throwException);

        for ( DiploidGenotype g : DiploidGenotype.values() ) {
            String bad = null;

            int i = g.ordinal();
            if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) {
                bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]);
            } else if ( ! MathUtils.wellFormedDouble(log10Posteriors[i]) || ! MathUtils.isNegativeOrZero(log10Posteriors[i]) ) {
                bad = String.format("Posterior %f is badly formed", log10Posteriors[i]);
            }

            if ( bad != null ) {
                throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
            }
        }
    } catch ( IllegalStateException e ) {
        if ( throwException )
            throw new RuntimeException(e);
        else
            return false;
    }

    return true;
}
|
||||
|
||||
//
|
||||
// Constant static data
|
||||
//
|
||||
final static double[] zeros = new double[DiploidGenotype.values().length];
|
||||
|
||||
static {
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
zeros[g.ordinal()] = 0.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -23,7 +23,7 @@
|
|||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
|
|
@ -23,7 +23,7 @@
|
|||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
public interface GenotypePriors {
|
||||
|
||||
|
|
@ -23,7 +23,7 @@
|
|||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
|
|
@ -1,401 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.pileup.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Base class for genotype calculation models that jointly estimate the
 * population allele frequency across all samples at a locus, then derive
 * per-sample genotype calls from the most likely frequency. Subclasses
 * supply the actual P(D|AF) computation by overriding
 * calculatelog10PofDgivenAFforAllF and/or calculateLog10PofDgivenAFforF.
 */
public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalculationModel {

    // for use in optimizing the P(D|AF) calculations:
    // how much off from the max likelihoods do we need to be before we can quit calculating?
    protected static final double LOG10_OPTIMIZATION_EPSILON = 8.0;
    // sentinel marking P(D|AF) entries that were skipped by the optimization
    protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE;
    private int minAlleleFrequencyToTest;

    // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
    protected double[] log10AlleleFrequencyPriors;

    // the allele frequency posteriors and P(f>0) for each alternate allele
    protected double[][] alleleFrequencyPosteriors = new double[BaseUtils.BASES.length][];
    protected double[][] log10PofDgivenAFi = new double[BaseUtils.BASES.length][];
    protected double[] PofFs = new double[BaseUtils.BASES.length];

    // the alternate allele with the largest sum of quality scores
    protected Byte bestAlternateAllele = null;

    // are we at a 'trigger' track site?
    protected boolean atTriggerTrack = false;

    // the standard filter to use for calls below the confidence threshold but above the emit threshold
    protected static final Set<String> filter = new HashSet<String>(1);

    /**
     * @param N              number of chromosomes / samples used to size the AF priors (2N+1 points)
     * @param heterozygosity expected heterozygosity used to build the AF priors
     */
    protected JointEstimateGenotypeCalculationModel(int N, double heterozygosity) {
        filter.add(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME);
        computeAlleleFrequencyPriors(N, heterozygosity);
    }

    /** Extended-locus calling is not supported by this model. */
    public VariantCallContext callExtendedLocus(RefMetaDataTracker tracker, byte[] ref, GenomeLoc loc, Map<String, StratifiedAlignmentContext> stratifiedContexts) {
        return null;
    }

    /**
     * Main entry point: jointly estimates the allele frequency at this locus
     * and emits a call (or a reference-confidence result) for it.
     */
    public VariantCallContext callLocus(RefMetaDataTracker tracker, byte ref, GenomeLoc loc, Map<String, StratifiedAlignmentContext> contexts, DiploidGenotypePriors priors) {
        int numSamples = getNSamples(contexts);
        int frequencyEstimationPoints = (2 * numSamples) + 1;  // (add 1 for allele frequency of zero)

        // reset the optimization value
        minAlleleFrequencyToTest = 0;

        // find the alternate allele with the largest sum of quality scores
        initializeBestAlternateAllele(ref, contexts);

        // did we trigger on the provided track?
        atTriggerTrack = tracker.getReferenceMetaData(UnifiedGenotyperEngine.TRIGGER_TRACK_NAME, false).size() > 0;

        // if there are no non-ref bases...
        if ( bestAlternateAllele == null ) {
            // if we don't want all bases, then we don't need to calculate genotype likelihoods
            if ( !atTriggerTrack && !UAC.ALL_BASES_MODE && !UAC.GENOTYPE_MODE ) {
                VariantCallContext vcc = new VariantCallContext(false);
                estimateReferenceConfidence(vcc, contexts, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, false);
                return vcc;
            }
            // otherwise, choose any alternate allele (it doesn't really matter)
            bestAlternateAllele = (byte)(ref != 'A' ? 'A' : 'C');
        }

        // calculate likelihoods if there are non-ref bases
        initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
        calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
        calculatePofFs(ref, frequencyEstimationPoints);

        // print out stats if we have a writer
        if ( verboseWriter != null )
            printAlleleFrequencyData(ref, loc, frequencyEstimationPoints);

        VariantCallContext vcc = createCalls(tracker, ref, contexts, loc, frequencyEstimationPoints);

        // technically, at this point our confidence in a reference call isn't accurately
        // estimated because it didn't take into account samples with no data
        if ( vcc.vc == null )
            estimateReferenceConfidence(vcc, contexts, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, true);
        return vcc;
    }

    /** @return the lowest allele-frequency index subclasses still need to evaluate */
    protected int getMinAlleleFrequencyToTest() {
        return minAlleleFrequencyToTest;
    }

    /** @return the number of samples with data at this locus */
    protected int getNSamples(Map<String, StratifiedAlignmentContext> contexts) {
        return contexts.size();
    }

    /**
     * Picks the non-reference base with the largest sum of quality scores
     * across all samples and stores it in bestAlternateAllele (null when no
     * non-ref base was observed).
     */
    protected void initializeBestAlternateAllele(byte ref, Map<String, StratifiedAlignmentContext> contexts) {
        int[] qualCounts = new int[4];

        for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
            // calculate the sum of quality scores for each base
            ReadBackedPileup pileup = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup();
            for ( PileupElement p : pileup ) {
                // ignore deletions
                if ( p.isDeletion() )
                    continue;

                int index = BaseUtils.simpleBaseToBaseIndex(p.getBase());
                if ( index >= 0 )
                    qualCounts[index] += p.getQual();
            }
        }

        // set the non-ref base with maximum quality score sum
        int maxCount = 0;
        bestAlternateAllele = null;
        for ( byte altAllele : BaseUtils.BASES ) {
            if ( altAllele == ref )
                continue;
            int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
            if ( qualCounts[index] > maxCount ) {
                maxCount = qualCounts[index];
                bestAlternateAllele = altAllele;
            }
        }
    }

    /** Hook for subclasses to prepare per-stratification state before AF estimation. */
    protected void initialize(byte ref, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {
        // by default, no initialization is done
        return;
    }

    /**
     * Fills log10AlleleFrequencyPriors: theta/i for frequency i > 0, with the
     * AF=0 entry taking whatever probability mass remains.
     */
    protected void computeAlleleFrequencyPriors(int N, double heterozygosity) {
        int AFs = (2 * N) + 1;
        log10AlleleFrequencyPriors = new double[AFs];

        // calculate the allele frequency priors for 1-N
        double sum = 0.0;
        for (int i = 1; i < AFs; i++) {
            double value = heterozygosity / (double)i;
            log10AlleleFrequencyPriors[i] = Math.log10(value);
            sum += value;
        }

        // null frequency for AF=0 is (1 - sum(all other frequencies))
        log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum);
    }

    /**
     * Estimates our confidence that the site is reference, folding in samples
     * with no coverage, and stores the verdict on vcc.confidentlyCalled.
     */
    private void estimateReferenceConfidence(VariantCallContext vcc, Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples) {

        double P_of_ref = 1.0;

        // use the AF=0 prob if it's calculated
        if ( ignoreCoveredSamples )
            P_of_ref = 1.0 - PofFs[BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele)];

        // for each sample that we haven't examined yet
        for ( String sample : samples ) {
            boolean isCovered = contexts.containsKey(sample);
            if ( ignoreCoveredSamples && isCovered )
                continue;

            int depth = isCovered ? contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup().size() : 0;
            P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
        }

        vcc.confidentlyCalled = QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING;
    }

    /**
     * Allocates the per-allele posterior/P(D|AF) arrays and computes P(D|AF)
     * for the single most likely alternate allele.
     */
    protected void calculateAlleleFrequencyPosteriors(byte ref, int frequencyEstimationPoints, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {

        // initialization
        for ( byte altAllele : BaseUtils.BASES ) {
            int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
            alleleFrequencyPosteriors[baseIndex] = new double[frequencyEstimationPoints];
            log10PofDgivenAFi[baseIndex] = new double[frequencyEstimationPoints];
        }

        // only calculate for the most likely alternate allele
        calculatelog10PofDgivenAFforAllF(ref, bestAlternateAllele, frequencyEstimationPoints-1, contexts, contextType);
    }

    /********************************************************************************/
    /*** One or both of the following methods should be overloaded in subclasses ***/
    /*** so that the correct calculation is made for PofDgivenAFi                 ***/
    /********************************************************************************/

    /**
     * @param ref            the ref base
     * @param alt            the alt base
     * @param numFrequencies total number of allele frequencies (2N)
     * @param contexts       stratified alignment contexts
     * @param contextType    which stratification to use
     */
    protected void calculatelog10PofDgivenAFforAllF(byte ref, byte alt, int numFrequencies, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {
        int baseIndex = BaseUtils.simpleBaseToBaseIndex(alt);

        // for each minor allele frequency, calculate log10PofDgivenAFi
        for (int i = 0; i <= numFrequencies; i++) {
            double f = (double)i / (double)(numFrequencies);
            log10PofDgivenAFi[baseIndex][i] += calculateLog10PofDgivenAFforF(ref, alt, f, contexts, contextType);
        }
    }

    /**
     * @param ref         the ref base
     * @param alt         the alt base
     * @param f           the allele frequency
     * @param contexts    stratified alignment contexts
     * @param contextType which stratification to use
     *
     * @return value of PofDgivenAF for allele frequency f
     */
    protected double calculateLog10PofDgivenAFforF(byte ref, byte alt, double f, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {
        return 0.0;
    }

    /********************************************************************************/

    /**
     * Marks all allele frequencies above freqI as not calculated, allowing
     * subclasses to short-circuit the P(D|AF) scan.
     *
     * @param freqI          allele frequency I
     * @param numFrequencies total number of allele frequencies
     * @param altBaseIndex   the index of the alternate allele
     */
    protected void ignoreAlleleFrequenciesAboveI(int freqI, int numFrequencies, int altBaseIndex) {
        while ( ++freqI <= numFrequencies )
            log10PofDgivenAFi[altBaseIndex][freqI] = VALUE_NOT_CALCULATED;
    }

    /**
     * Combines P(D|AF) with the AF priors to get normalized AF posteriors for
     * the best alternate allele, and records P(f>0) in PofFs.
     *
     * @param ref                       the ref base
     * @param frequencyEstimationPoints number of allele frequencies
     */
    protected void calculatePofFs(byte ref, int frequencyEstimationPoints) {
        // only calculate for the most likely alternate allele
        int baseIndex = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele);

        // multiply by null allele frequency priors to get AF posteriors, then normalize
        for (int i = 0; i < frequencyEstimationPoints; i++)
            alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i];
        alleleFrequencyPosteriors[baseIndex] = MathUtils.normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]);

        // calculate p(f>0)
        double sum = 0.0;
        for (int i = 1; i < frequencyEstimationPoints; i++)
            sum += alleleFrequencyPosteriors[baseIndex][i];
        PofFs[baseIndex] = Math.min(sum, 1.0); // deal with precision errors
    }

    /**
     * Dumps per-frequency prior/likelihood/posterior data plus summary
     * P(f>0)/Qscore/LOD lines to the verbose writer (debugging aid).
     *
     * @param ref                       the ref base
     * @param loc                       the GenomeLoc
     * @param frequencyEstimationPoints number of allele frequencies
     */
    protected void printAlleleFrequencyData(byte ref, GenomeLoc loc, int frequencyEstimationPoints) {
        for (int i = 0; i < frequencyEstimationPoints; i++) {
            StringBuilder AFline = new StringBuilder("AFINFO\t");
            AFline.append(loc).append("\t");
            AFline.append(i + "/" + (frequencyEstimationPoints-1) + "\t");
            AFline.append(String.format("%.2f\t", ((float)i)/ (frequencyEstimationPoints-1)));
            AFline.append(String.format("%.8f", log10AlleleFrequencyPriors[i]) + "\t");
            for ( byte altAllele : BaseUtils.BASES ) {
                if ( altAllele != ref ) {
                    int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
                    double PofDgivenAF = log10PofDgivenAFi[baseIndex][i];
                    if ( PofDgivenAF == VALUE_NOT_CALCULATED )
                        PofDgivenAF = 0.0;
                    AFline.append(String.format("%.8f\t%.8f\t", PofDgivenAF, alleleFrequencyPosteriors[baseIndex][i]));
                } else {
                    AFline.append(String.format("%.8f\t%.8f\t", -1.0, -1.0));
                }
            }
            verboseWriter.println(AFline);
        }

        for ( byte altAllele : BaseUtils.BASES ) {
            if ( altAllele != ref ) {
                char base = Character.toLowerCase((char)altAllele);
                int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
                if ( MathUtils.compareDoubles(PofFs[baseIndex], 0.0) != 0 ) {
                    double phredScaledConfidence = QualityUtils.phredScaleErrorRate(alleleFrequencyPosteriors[baseIndex][0]);
                    if ( Double.isInfinite(phredScaledConfidence) ) {
                        phredScaledConfidence = -10.0 * log10PofDgivenAFi[baseIndex][0];
                        verboseWriter.println("P(f>0)_" + base + " = 1");
                        verboseWriter.println("Qscore_" + base + " = " + phredScaledConfidence);
                        verboseWriter.println("LOD_" + base + " = " + phredScaledConfidence);
                    } else {
                        verboseWriter.println("P(f>0)_" + base + " = " + PofFs[baseIndex]);
                        verboseWriter.println("Qscore_" + base + " = " + (QualityUtils.phredScaleErrorRate(alleleFrequencyPosteriors[baseIndex][0])));
                        verboseWriter.println("LOD_" + base + " = " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
                    }
                }
            }
        }
        verboseWriter.println();
    }

    /** Hook for subclasses to produce per-sample genotype calls; empty by default. */
    protected Map<String, Genotype> makeGenotypeCalls(byte ref, byte alt, int frequency, Map<String, StratifiedAlignmentContext> contexts, GenomeLoc loc) {
        // by default, we return no genotypes
        return new HashMap<String, Genotype>();
    }

    /**
     * Builds the VariantCallContext for this locus from the computed AF
     * posteriors: confidence, genotypes, alleles, dbSNP id, and (unless
     * disabled) the strand-bias score. NOTE: the strand-bias section reuses
     * and overwrites the AF data structures, so it runs last.
     */
    protected VariantCallContext createCalls(RefMetaDataTracker tracker, byte ref, Map<String, StratifiedAlignmentContext> contexts, GenomeLoc loc, int frequencyEstimationPoints) {
        // only need to look at the most likely alternate allele
        int indexOfMax = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele);

        int bestAFguess = Utils.findIndexOfMaxEntry(alleleFrequencyPosteriors[indexOfMax]);
        double phredScaledConfidence;
        if ( bestAFguess != 0 ) {
            phredScaledConfidence = QualityUtils.phredScaleErrorRate(alleleFrequencyPosteriors[indexOfMax][0]);
            if ( Double.isInfinite(phredScaledConfidence) )
                phredScaledConfidence = -10.0 * log10PofDgivenAFi[indexOfMax][0];
        } else {
            phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofFs[indexOfMax]);
            if ( Double.isInfinite(phredScaledConfidence) ) {
                double sum = 0.0;
                for (int i = 1; i < frequencyEstimationPoints; i++) {
                    if ( log10PofDgivenAFi[indexOfMax][i] == VALUE_NOT_CALCULATED )
                        break;
                    sum += log10PofDgivenAFi[indexOfMax][i];
                }
                phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
            }
        }

        // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
        if ( !UAC.ALL_BASES_MODE && !passesEmitThreshold(phredScaledConfidence, bestAFguess) )
            return new VariantCallContext(passesCallThreshold(phredScaledConfidence));

        // populate the sample-specific data (output it to beagle also if requested)
        Map<String, Genotype> genotypes = makeGenotypeCalls(ref, bestAlternateAllele, bestAFguess, contexts, loc);

        // next, the variant context data (alleles, attributes, etc.)
        ArrayList<Allele> alleles = new ArrayList<Allele>();
        alleles.add(Allele.create(ref, true));
        if ( bestAFguess != 0 )
            alleles.add(Allele.create(bestAlternateAllele, false));

        // *** note that calculating strand bias involves overwriting data structures, so we do that last
        HashMap<String, Object> attributes = new HashMap<String, Object>();

        DbSNPFeature dbsnp = getDbSNP(tracker);
        if ( dbsnp != null )
            attributes.put(VariantContext.ID_KEY, dbsnp.getRsID());

        if ( !UAC.NO_SLOD ) {
            // the overall lod
            double overallLog10PofNull = log10AlleleFrequencyPriors[0] + log10PofDgivenAFi[indexOfMax][0];
            double overallLog10PofF = log10AlleleFrequencyPriors[bestAFguess] + log10PofDgivenAFi[indexOfMax][bestAFguess];
            double lod = overallLog10PofF - overallLog10PofNull;

            // set the optimization value for the subsequent strand calculations
            minAlleleFrequencyToTest = bestAFguess;

            // the forward lod
            initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD);
            calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD);
            calculatePofFs(ref, frequencyEstimationPoints);
            double forwardLog10PofNull = log10AlleleFrequencyPriors[0] + log10PofDgivenAFi[indexOfMax][0];
            double forwardLog10PofF = log10AlleleFrequencyPriors[bestAFguess] + log10PofDgivenAFi[indexOfMax][bestAFguess];

            // the reverse lod
            initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE);
            calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE);
            calculatePofFs(ref, frequencyEstimationPoints);
            double reverseLog10PofNull = log10AlleleFrequencyPriors[0] + log10PofDgivenAFi[indexOfMax][0];
            double reverseLog10PofF = log10AlleleFrequencyPriors[bestAFguess] + log10PofDgivenAFi[indexOfMax][bestAFguess];

            double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofNull;
            double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofNull;
            //logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);

            // strand score is max bias between forward and reverse strands
            double strandScore = Math.max(forwardLod - lod, reverseLod - lod);
            // rescale by a factor of 10
            strandScore *= 10.0;
            //logger.debug(String.format("SLOD=%f", strandScore));

            attributes.put("SB", Double.valueOf(strandScore));
        }

        VariantContext vc = new VariantContext("UG_SNP_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes);

        return new VariantCallContext(vc, passesCallThreshold(phredScaledConfidence));
    }

    /**
     * @return true if the confidence clears the (trigger-track-aware) emit
     *         threshold; non-variant best guesses only pass in GENOTYPE_MODE
     */
    protected boolean passesEmitThreshold(double conf, int bestAFguess) {
        return (atTriggerTrack ?
                (conf >= Math.min(UAC.TRIGGER_CONFIDENCE_FOR_CALLING, UAC.TRIGGER_CONFIDENCE_FOR_EMITTING)) :
                ((UAC.GENOTYPE_MODE || bestAFguess != 0) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING)));
    }

    /** @return true if the confidence clears the (trigger-track-aware) calling threshold */
    protected boolean passesCallThreshold(double conf) {
        return (atTriggerTrack ?
                (conf >= UAC.TRIGGER_CONFIDENCE_FOR_CALLING) :
                (conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING));
    }
}
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* This implements the old style CRD calculation of the chance that a base being a true chromBase given
|
||||
* an miscalled base, in which the p is e, grabbing all of the probability. It shouldn't be used
|
||||
*/
|
||||
public class OneStateErrorProbabilities extends FourBaseProbabilities {
|
||||
//
|
||||
// forwarding constructors -- don't do anything at all
|
||||
//
|
||||
public OneStateErrorProbabilities() { super(); }
|
||||
|
||||
/**
|
||||
* Cloning of the object
|
||||
* @return clone
|
||||
* @throws CloneNotSupportedException
|
||||
*/
|
||||
protected Object clone() throws CloneNotSupportedException {
|
||||
return super.clone();
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @param chromBase target base
|
||||
* @param read SAM read
|
||||
* @param offset offset on read
|
||||
* @return log10 likelihood
|
||||
*/
|
||||
protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
|
||||
return 0; // equivalent to e model
|
||||
}
|
||||
}
|
||||
|
|
@ -23,7 +23,7 @@
|
|||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
|
|
@ -1,120 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.contexts.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class SimpleIndelCalculationModel extends GenotypeCalculationModel {
|
||||
private final GenomeLocParser genomeLocParser;
|
||||
|
||||
private int MIN_COVERAGE = 6;
|
||||
private double MIN_FRACTION = 0.3;
|
||||
private double MIN_CONSENSUS_FRACTION = 0.7 ;
|
||||
// the previous normal event context
|
||||
// private Map<String, StratifiedAlignmentContext> cachedContext;
|
||||
|
||||
protected SimpleIndelCalculationModel(GenomeLocParser genomeLocParser) {
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
}
|
||||
|
||||
private int totalIndels = 0;
|
||||
private int totalCoverage = 0;
|
||||
private int bestIndelCount = 0;
|
||||
private String bestEvent = null;
|
||||
|
||||
|
||||
public VariantCallContext callLocus(RefMetaDataTracker tracker, byte ref, GenomeLoc loc, Map<String, StratifiedAlignmentContext> contexts, DiploidGenotypePriors priors) {
|
||||
// cachedContext = contexts;
|
||||
return null;
|
||||
}
|
||||
|
||||
public VariantCallContext callExtendedLocus(RefMetaDataTracker tracker, byte[] ref, GenomeLoc loc, Map<String, StratifiedAlignmentContext> contexts) {
|
||||
|
||||
totalIndels = 0;
|
||||
totalCoverage = 0;
|
||||
bestIndelCount = 0;
|
||||
bestEvent = null;
|
||||
/*
|
||||
System.out.println("\nReached " + loc + " through an extended event");
|
||||
for (Map.Entry<String,StratifiedAlignmentContext> e : contexts.entrySet()) {
|
||||
System.out.println("Set "+e.getKey());
|
||||
System.out.println(" Context: "+e.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size()+ " reads");
|
||||
ReadBackedExtendedEventPileup p = e.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getExtendedEventPileup();
|
||||
if ( p== null ) System.out.println("EXTENDED PILEUP IS NULL");
|
||||
System.out.println(" Event(s): " + e.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getExtendedEventPileup().getEventStringsWithCounts(ref));
|
||||
}
|
||||
*/
|
||||
initializeAlleles(ref, contexts);
|
||||
|
||||
VariantCallContext vcc = new VariantCallContext(false);
|
||||
|
||||
if ( totalIndels == 0 ) return vcc; // this can happen if indel-containing reads get filtered out by the engine
|
||||
|
||||
if ( totalCoverage < MIN_COVERAGE ) return vcc;
|
||||
|
||||
if ( ((double)bestIndelCount)/totalCoverage < MIN_FRACTION ) return vcc;
|
||||
|
||||
if ( ((double)bestIndelCount)/totalIndels < MIN_CONSENSUS_FRACTION ) return vcc;
|
||||
|
||||
List<Allele> alleles = new ArrayList<Allele>(2);
|
||||
|
||||
if ( bestEvent.charAt(0) == '+') {
|
||||
alleles.add( Allele.create(Allele.NULL_ALLELE_STRING,true) );
|
||||
alleles.add( Allele.create(bestEvent.substring(1), false ));
|
||||
} else {
|
||||
if ( bestEvent.charAt(0) == '-' ) {
|
||||
alleles.add( Allele.create(Allele.NULL_ALLELE_STRING,false) );
|
||||
alleles.add( Allele.create(bestEvent.substring(1), true ));
|
||||
loc = genomeLocParser.setStop(loc, loc.getStop() + bestEvent.length()-1);
|
||||
} else
|
||||
throw new ReviewedStingException("Internal error (probably a bug): event does not conform to expected format: "+ bestEvent);
|
||||
}
|
||||
|
||||
VariantContext vc = new VariantContext("UG_Indel_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, new HashMap<String, Genotype>() /* genotypes */,
|
||||
-1.0 /* log error */, null /* filters */, null /* attributes */);
|
||||
|
||||
vcc = new VariantCallContext(vc,true);
|
||||
vcc.setRefBase(ref[0]);
|
||||
/*
|
||||
if ( totalIndels > 0 ) {
|
||||
System.out.println("Calling: "+bestEvent+" ["+bestIndelCount+"/"+totalIndels+"/"+totalCoverage+"] at "+loc);
|
||||
} else {
|
||||
System.out.println("NO EVENT");
|
||||
}
|
||||
*/
|
||||
//VariantContext vc = new MutableVariantContext("UG_indel_call", loc, alleles, genotypes, phredScaledConfidence/10.0, null, attributes);
|
||||
//return new VariantCallContext(vc, phredScaledConfidence >= CONFIDENCE_THRESHOLD);
|
||||
|
||||
return vcc;
|
||||
}
|
||||
|
||||
protected void initializeAlleles(byte[] ref, Map<String, StratifiedAlignmentContext> contexts) {
|
||||
|
||||
|
||||
for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
|
||||
AlignmentContext context = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
||||
|
||||
totalCoverage += context.size();
|
||||
|
||||
// calculate the sum of quality scores for each base
|
||||
ReadBackedExtendedEventPileup pileup = context.getExtendedEventPileup();
|
||||
|
||||
List<Pair<String,Integer>> all_events = pileup.getEventStringsWithCounts(ref);
|
||||
for ( Pair<String,Integer> p : all_events ) {
|
||||
if ( p.second > bestIndelCount ) {
|
||||
bestIndelCount = p.second;
|
||||
bestEvent = p.first;
|
||||
}
|
||||
totalIndels += p.second;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,10 +1,35 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
public class ThreeStateErrorProbabilities extends FourBaseProbabilities {
|
||||
public class ThreeStateErrorProbabilities extends FourBaseLikelihoods {
|
||||
//
|
||||
// forwarding constructors -- don't do anything at all
|
||||
//
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
|
|
@ -32,14 +32,17 @@ import org.broadinstitute.sting.commandline.Hidden;
|
|||
public class UnifiedArgumentCollection {
|
||||
|
||||
// control the various models to be used
|
||||
@Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- JOINT_ESTIMATE is currently the default.", required = false)
|
||||
public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.JOINT_ESTIMATE;
|
||||
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while DINDEL is also available for calling indels.", required = false)
|
||||
public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
|
||||
|
||||
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
|
||||
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false)
|
||||
public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
|
||||
|
||||
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ when using the SNP Genotype Likelihoods model -- EMPIRICAL is the recommended default, but it's possible to select the THREE_STATE model for comparison purposes", required = false)
|
||||
public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL;
|
||||
|
||||
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
|
||||
public Double heterozygosity = DiploidGenotypePriors.HUMAN_HETEROZYGOSITY;
|
||||
public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY;
|
||||
|
||||
// control the output
|
||||
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false)
|
||||
|
|
@ -49,16 +52,16 @@ public class UnifiedArgumentCollection {
|
|||
public boolean ALL_BASES_MODE = false;
|
||||
|
||||
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false)
|
||||
public double STANDARD_CONFIDENCE_FOR_CALLING = 50.0;
|
||||
public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
|
||||
|
||||
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
|
||||
public double STANDARD_CONFIDENCE_FOR_EMITTING = 50.0;
|
||||
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
|
||||
|
||||
@Argument(fullName = "trigger_min_confidence_threshold_for_calling", shortName = "trig_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be called", required = false)
|
||||
public double TRIGGER_CONFIDENCE_FOR_CALLING = 50.0;
|
||||
public double TRIGGER_CONFIDENCE_FOR_CALLING = 30.0;
|
||||
|
||||
@Argument(fullName = "trigger_min_confidence_threshold_for_emitting", shortName = "trig_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
|
||||
public double TRIGGER_CONFIDENCE_FOR_EMITTING = 50.0;
|
||||
public double TRIGGER_CONFIDENCE_FOR_EMITTING = 30.0;
|
||||
|
||||
@Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false)
|
||||
public boolean NO_SLOD = false;
|
||||
|
|
@ -76,7 +79,7 @@ public class UnifiedArgumentCollection {
|
|||
|
||||
// control the various parameters to be used
|
||||
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
|
||||
public int MIN_BASE_QUALTY_SCORE = 20;
|
||||
public int MIN_BASE_QUALTY_SCORE = 17;
|
||||
|
||||
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false)
|
||||
public int MIN_MAPPING_QUALTY_SCORE = 20;
|
||||
|
|
@ -90,14 +93,11 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
||||
public Double MAX_DELETION_FRACTION = 0.05;
|
||||
|
||||
@Argument(fullName = "cap_base_quality_by_mapping_quality", shortName = "cap_base_qual", doc = "Cap the base quality of any given base by its read's mapping quality", required = false)
|
||||
public boolean CAP_BASE_QUALITY = false;
|
||||
|
||||
|
||||
public UnifiedArgumentCollection clone() {
|
||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||
|
||||
uac.genotypeModel = genotypeModel;
|
||||
uac.GLmodel = GLmodel;
|
||||
uac.baseModel = baseModel;
|
||||
uac.heterozygosity = heterozygosity;
|
||||
uac.GENOTYPE_MODE = GENOTYPE_MODE;
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
|||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
|
@ -44,21 +45,18 @@ import java.io.PrintStream;
|
|||
* A variant caller which unifies the approaches of several disparate callers. Works for single-sample and
|
||||
* multi-sample data. The user can choose from several different incorporated calculation models.
|
||||
*/
|
||||
@Reference(window=@Window(start=-20,stop=20))
|
||||
@Reference(window=@Window(start=-200,stop=200))
|
||||
@By(DataSource.REFERENCE)
|
||||
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
|
||||
public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGenotyper.UGStatistics> implements TreeReducible<UnifiedGenotyper.UGStatistics> {
|
||||
|
||||
|
||||
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||
|
||||
// control the output
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected VCFWriter writer = null;
|
||||
|
||||
@Argument(fullName="variants_out",shortName="varout",doc="Please use --out instead",required=false)
|
||||
@Deprecated
|
||||
protected String varout;
|
||||
|
||||
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
|
||||
protected PrintStream verboseWriter = null;
|
||||
|
||||
|
|
@ -81,7 +79,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
// enable extended events for indels
|
||||
public boolean generateExtendedEvents() { return UAC.genotypeModel == GenotypeCalculationModel.Model.DINDEL; }
|
||||
public boolean generateExtendedEvents() { return UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL; }
|
||||
|
||||
/**
|
||||
* Inner class for collecting output statistics from the UG
|
||||
|
|
@ -104,7 +102,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
|
||||
double percentCallableOfAll() { return (100.0 * nBasesCallable) / (nBasesVisited-nExtendedEvents); }
|
||||
double percentCalledOfAll() { return (100.0 * nBasesCalledConfidently) / (nBasesVisited-nExtendedEvents); }
|
||||
double percentCalledOfCallable() { return (100.0 * nBasesCalledConfidently) / (nBasesVisited-nExtendedEvents); }
|
||||
double percentCalledOfCallable() { return (100.0 * nBasesCalledConfidently) / (nBasesCallable); }
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -112,21 +110,23 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
*
|
||||
**/
|
||||
public void initialize() {
|
||||
annotationEngine = new VariantAnnotatorEngine(getToolkit(), Arrays.asList(annotationClassesToUse), annotationsToUse);
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, writer, verboseWriter, annotationEngine);
|
||||
// get all of the unique sample names
|
||||
// if we're supposed to assume a single sample, do so
|
||||
Set<String> samples = new TreeSet<String>();
|
||||
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
|
||||
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
|
||||
else
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
|
||||
// initialize the writers
|
||||
if ( verboseWriter != null ) {
|
||||
StringBuilder header = new StringBuilder("AFINFO\tLOC\tMAF\tF\tNullAFpriors\t");
|
||||
for ( byte altAllele : BaseUtils.BASES ) {
|
||||
header.append("POfDGivenAFFor" + (char)altAllele + "\t");
|
||||
header.append("PosteriorAFFor" + (char)altAllele + "\t");
|
||||
}
|
||||
verboseWriter.println(header);
|
||||
}
|
||||
// initialize the verbose writer
|
||||
if ( verboseWriter != null )
|
||||
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior");
|
||||
|
||||
annotationEngine = new VariantAnnotatorEngine(getToolkit(), Arrays.asList(annotationClassesToUse), annotationsToUse);
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples);
|
||||
|
||||
// initialize the header
|
||||
writer.writeHeader(new VCFHeader(getHeaderInfo(), UG_engine.samples)) ;
|
||||
writer.writeHeader(new VCFHeader(getHeaderInfo(), samples)) ;
|
||||
}
|
||||
|
||||
private Set<VCFHeaderLine> getHeaderInfo() {
|
||||
|
|
@ -136,14 +136,24 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
|
||||
|
||||
// annotation (INFO) fields from UnifiedGenotyper
|
||||
for ( Map.Entry<String, String> dbSet : UG_engine.dbAnnotations.entrySet() )
|
||||
headerInfo.add(new VCFInfoHeaderLine(dbSet.getValue(), 0, VCFHeaderLineType.Flag, (dbSet.getKey().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ? "dbSNP" : dbSet.getValue()) + " Membership"));
|
||||
if ( !UAC.NO_SLOD )
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias"));
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
|
||||
|
||||
// also, check to see whether comp rods were included
|
||||
List<ReferenceOrderedDataSource> dataSources = getToolkit().getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
if ( source.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership"));
|
||||
}
|
||||
else if ( source.getName().startsWith(VariantAnnotatorEngine.dbPrefix) ) {
|
||||
String name = source.getName().substring(VariantAnnotatorEngine.dbPrefix.length());
|
||||
headerInfo.add(new VCFInfoHeaderLine(name, 0, VCFHeaderLineType.Flag, name + " Membership"));
|
||||
}
|
||||
}
|
||||
|
||||
// FORMAT and INFO fields
|
||||
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings(VCFConstants.GENOTYPE_LIKELIHOODS_KEY));
|
||||
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY));
|
||||
|
||||
// FILTER fields
|
||||
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING ||
|
||||
|
|
@ -162,7 +172,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
|
|||
* @return the VariantCallContext object
|
||||
*/
|
||||
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
return UG_engine.runGenotyper(tracker, refContext, rawContext);
|
||||
return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext);
|
||||
}
|
||||
|
||||
public UGStatistics reduceInit() { return new UGStatistics(); }
|
||||
|
|
|
|||
|
|
@ -26,202 +26,656 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.VCFWriter;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.pileup.*;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecordFilter;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broad.tribble.vcf.VCFConstants;
|
||||
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
|
||||
|
||||
public class UnifiedGenotyperEngine {
|
||||
|
||||
public static final String TRIGGER_TRACK_NAME = "trigger";
|
||||
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
|
||||
|
||||
protected HashMap<String, String> dbAnnotations = new HashMap<String, String>();
|
||||
|
||||
// the unified argument collection
|
||||
protected UnifiedArgumentCollection UAC = null;
|
||||
private UnifiedArgumentCollection UAC = null;
|
||||
|
||||
// the annotation engine
|
||||
protected VariantAnnotatorEngine annotationEngine;
|
||||
private VariantAnnotatorEngine annotationEngine;
|
||||
|
||||
// the model used for calculating genotypes
|
||||
protected ThreadLocal<GenotypeCalculationModel> gcm = new ThreadLocal<GenotypeCalculationModel>();
|
||||
private ThreadLocal<GenotypeLikelihoodsCalculationModel> glcm = new ThreadLocal<GenotypeLikelihoodsCalculationModel>();
|
||||
|
||||
// the various loggers and writers
|
||||
protected Logger logger = null;
|
||||
protected VCFWriter vcfWriter = null;
|
||||
protected PrintStream verboseWriter = null;
|
||||
// the model used for calculating p(non-ref)
|
||||
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
|
||||
|
||||
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
|
||||
private double[] log10AlleleFrequencyPriors;
|
||||
|
||||
// the allele frequency likelihoods (allocated once as an optimization)
|
||||
private ThreadLocal<double[]> log10AlleleFrequencyPosteriors = new ThreadLocal<double[]>();
|
||||
|
||||
// the priors object
|
||||
private GenotypePriors genotypePriors;
|
||||
|
||||
// samples in input
|
||||
protected Set<String> samples = new HashSet<String>();
|
||||
private Set<String> samples = new TreeSet<String>();
|
||||
|
||||
// the various loggers and writers
|
||||
private Logger logger = null;
|
||||
private PrintStream verboseWriter = null;
|
||||
|
||||
// fasta reference reader to supplement the edges of the reference sequence for long reads
|
||||
private IndexedFastaSequenceFile referenceReader;
|
||||
|
||||
// number of chromosomes (2 * samples) in input
|
||||
private int N;
|
||||
|
||||
// the standard filter to use for calls below the confidence threshold but above the emit threshold
|
||||
private static final Set<String> filter = new HashSet<String>(1);
|
||||
|
||||
private static final int MISMATCH_WINDOW_SIZE = 20;
|
||||
|
||||
|
||||
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
|
||||
initialize(toolkit, UAC, null, null, null, null);
|
||||
// get the number of samples
|
||||
// if we're supposed to assume a single sample, do so
|
||||
samples = new TreeSet<String>();
|
||||
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
|
||||
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
|
||||
else
|
||||
samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader());
|
||||
|
||||
initialize(toolkit, UAC, null, null, null, samples.size());
|
||||
}
|
||||
|
||||
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, VCFWriter genotypeWriter, PrintStream verboseWriter, VariantAnnotatorEngine engine) {
|
||||
initialize(toolkit, UAC, logger, genotypeWriter, verboseWriter, engine);
|
||||
|
||||
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set<String> samples) {
|
||||
this.samples = new TreeSet<String>(samples);
|
||||
initialize(toolkit, UAC, logger, verboseWriter, engine, samples.size());
|
||||
}
|
||||
|
||||
private void initialize(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, VCFWriter genotypeWriter, PrintStream verboseWriter, VariantAnnotatorEngine engine) {
|
||||
private void initialize(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
|
||||
this.UAC = UAC;
|
||||
this.logger = logger;
|
||||
this.vcfWriter = genotypeWriter;
|
||||
this.verboseWriter = verboseWriter;
|
||||
this.annotationEngine = engine;
|
||||
|
||||
// deal with input errors
|
||||
if ( toolkit.getArguments().numberOfThreads > 1 && UAC.ASSUME_SINGLE_SAMPLE != null ) {
|
||||
// the ASSUME_SINGLE_SAMPLE argument can't be handled (at least for now) while we are multi-threaded because the IO system doesn't know how to get the sample name
|
||||
throw new IllegalArgumentException("For technical reasons, the ASSUME_SINGLE_SAMPLE argument cannot be used with multiple threads; please run again without the -nt argument");
|
||||
}
|
||||
N = 2 * numSamples;
|
||||
log10AlleleFrequencyPriors = new double[N+1];
|
||||
computeAlleleFrequencyPriors(N);
|
||||
genotypePriors = createGenotypePriors(UAC);
|
||||
|
||||
// get all of the unique sample names
|
||||
// if we're supposed to assume a single sample, do so
|
||||
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
|
||||
this.samples.add(UAC.ASSUME_SINGLE_SAMPLE);
|
||||
else
|
||||
this.samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader());
|
||||
filter.add(LOW_QUAL_FILTER_NAME);
|
||||
|
||||
// check to see whether comp rods were included
|
||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
if ( source.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
|
||||
dbAnnotations.put(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, VCFConstants.DBSNP_KEY);
|
||||
}
|
||||
else if ( source.getName().startsWith(VariantAnnotatorEngine.dbPrefix) ) {
|
||||
dbAnnotations.put(source.getName(), source.getName().substring(VariantAnnotatorEngine.dbPrefix.length()));
|
||||
}
|
||||
}
|
||||
referenceReader = new IndexedFastaSequenceFile(toolkit.getArguments().referenceFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute at a given locus.
|
||||
* Compute full calls at a given locus.
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @return the VariantCallContext object
|
||||
*/
|
||||
public VariantCallContext runGenotyper(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
|
||||
// initialize the GenotypeCalculationModel for this thread if that hasn't been done yet
|
||||
if ( gcm.get() == null ) {
|
||||
gcm.set(GenotypeCalculationModelFactory.makeGenotypeCalculation(samples, logger, UAC, verboseWriter));
|
||||
}
|
||||
|
||||
byte ref = refContext.getBase();
|
||||
if ( !BaseUtils.isRegularBase(ref) )
|
||||
public VariantCallContext calculateLikelihoodsAndGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
|
||||
VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
||||
if ( vc == null )
|
||||
return null;
|
||||
|
||||
VariantCallContext call;
|
||||
BadReadPileupFilter badReadPileupFilter = new BadReadPileupFilter(refContext);
|
||||
VariantCallContext vcc = calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc);
|
||||
vcc.vc = GLsToPLs(vcc.vc);
|
||||
return vcc;
|
||||
}
|
||||
|
||||
if ( rawContext.hasExtendedEventPileup() ) {
|
||||
/**
|
||||
* Compute GLs at a given locus.
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @return the VariantContext object
|
||||
*/
|
||||
public VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
|
||||
VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
||||
return GLsToPLs(vc);
|
||||
}
|
||||
|
||||
ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup();
|
||||
private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map<String, StratifiedAlignmentContext> stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType type) {
|
||||
if ( stratifiedContexts == null )
|
||||
return null;
|
||||
|
||||
// filter the context based on min mapping quality
|
||||
ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE);
|
||||
|
||||
// filter the context based on bad mates and mismatch rate
|
||||
pileup = pileup.getFilteredPileup(badReadPileupFilter);
|
||||
|
||||
// don't call when there is no coverage
|
||||
if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE )
|
||||
return null;
|
||||
|
||||
// stratify the AlignmentContext and cut by sample
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
|
||||
if ( stratifiedContexts == null )
|
||||
return null;
|
||||
|
||||
call = gcm.get().callExtendedLocus(tracker, refContext.getBasesAtLocus(-1), rawContext.getLocation(), stratifiedContexts);
|
||||
|
||||
} else {
|
||||
|
||||
ReadBackedPileup rawPileup = rawContext.getBasePileup();
|
||||
|
||||
// filter the context based on min base and mapping qualities
|
||||
ReadBackedPileup pileup = rawPileup.getBaseAndMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE, UAC.MIN_MAPPING_QUALTY_SCORE);
|
||||
|
||||
// filter the context based on bad mates and mismatch rate
|
||||
pileup = pileup.getFilteredPileup(badReadPileupFilter);
|
||||
|
||||
// don't call when there is no coverage
|
||||
if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE )
|
||||
return null;
|
||||
|
||||
// are there too many deletions in the pileup?
|
||||
if ( UAC.genotypeModel != GenotypeCalculationModel.Model.DINDEL &&
|
||||
isValidDeletionFraction(UAC.MAX_DELETION_FRACTION) &&
|
||||
(double)pileup.getNumberOfDeletions() / (double)pileup.size() > UAC.MAX_DELETION_FRACTION )
|
||||
return null;
|
||||
|
||||
// stratify the AlignmentContext and cut by sample
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
|
||||
if ( stratifiedContexts == null )
|
||||
return null;
|
||||
|
||||
DiploidGenotypePriors priors = new DiploidGenotypePriors(ref, UAC.heterozygosity, DiploidGenotypePriors.PROB_OF_REFERENCE_ERROR);
|
||||
call = gcm.get().callLocus(tracker, ref, rawContext.getLocation(), stratifiedContexts, priors);
|
||||
|
||||
// annotate the call, if possible
|
||||
if ( call != null && call.vc != null && annotationEngine != null ) {
|
||||
// first off, we want to use the *unfiltered* context for the annotations
|
||||
stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE);
|
||||
|
||||
Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, call.vc);
|
||||
call.vc = variantContexts.iterator().next(); //We know the collection will always have exactly 1 element.
|
||||
}
|
||||
// initialize the data for this thread if that hasn't been done yet
|
||||
if ( glcm.get() == null ) {
|
||||
glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
|
||||
}
|
||||
|
||||
if ( call != null && call.vc != null ) {
|
||||
call.setRefBase(ref);
|
||||
Map<String, BiallelicGenotypeLikelihoods> GLs = new HashMap<String, BiallelicGenotypeLikelihoods>();
|
||||
Allele refAllele = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, type, genotypePriors, GLs);
|
||||
|
||||
// if the site was downsampled, record that fact
|
||||
if ( rawContext.hasPileupBeenDownsampled() ) {
|
||||
Map<String, Object> attrs = new HashMap<String, Object>(call.vc.getAttributes());
|
||||
attrs.put(VCFConstants.DOWNSAMPLED_KEY, true);
|
||||
call.vc = VariantContext.modifyAttributes(call.vc, attrs);
|
||||
return createVariantContextFromLikelihoods(refContext, refAllele, GLs);
|
||||
}
|
||||
|
||||
private VariantContext createVariantContextFromLikelihoods(ReferenceContext refContext, Allele refAllele, Map<String, BiallelicGenotypeLikelihoods> GLs) {
|
||||
// no-call everyone for now
|
||||
List<Allele> noCall = new ArrayList<Allele>();
|
||||
noCall.add(Allele.NO_CALL);
|
||||
|
||||
Set<Allele> alleles = new HashSet<Allele>();
|
||||
alleles.add(refAllele);
|
||||
boolean addedAltAllele = false;
|
||||
|
||||
HashMap<String, Genotype> genotypes = new HashMap<String, Genotype>();
|
||||
for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) {
|
||||
if ( !addedAltAllele ) {
|
||||
addedAltAllele = true;
|
||||
alleles.add(GL.getAlleleA());
|
||||
alleles.add(GL.getAlleleB());
|
||||
}
|
||||
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods(), VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
|
||||
attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth());
|
||||
attributes.put(likelihoods.getKey(), likelihoods.getAsString());
|
||||
|
||||
genotypes.put(GL.getSample(), new Genotype(GL.getSample(), noCall, Genotype.NO_NEG_LOG_10PERROR, null, attributes, false));
|
||||
}
|
||||
|
||||
GenomeLoc loc = refContext.getLocus();
|
||||
int endLoc = calculateEndPos(alleles, refAllele, loc);
|
||||
|
||||
return new VariantContext("UG_call",
|
||||
loc.getContig(),
|
||||
loc.getStart(),
|
||||
endLoc,
|
||||
alleles,
|
||||
genotypes,
|
||||
VariantContext.NO_NEG_LOG_10PERROR,
|
||||
null,
|
||||
null);
|
||||
}
|
||||
|
||||
private static VariantContext GLsToPLs(VariantContext vc) {
|
||||
if ( vc == null )
|
||||
return null;
|
||||
|
||||
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
|
||||
for ( Map.Entry<String, Genotype> genotype : vc.getGenotypes().entrySet() ) {
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>(genotype.getValue().getAttributes());
|
||||
attributes.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
|
||||
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.GLsToPLs(genotype.getValue().getLikelihoods().getAsVector()));
|
||||
calls.put(genotype.getKey(), Genotype.modifyAttributes(genotype.getValue(), attributes));
|
||||
}
|
||||
return VariantContext.modifyGenotypes(vc, calls);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute genotypes at a given locus.
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @param vc the GL-annotated variant context
|
||||
* @return the VariantCallContext object
|
||||
*/
|
||||
public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) {
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
|
||||
return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc);
|
||||
}
|
||||
|
||||
private VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
||||
// initialize the data for this thread if that hasn't been done yet
|
||||
if ( afcm.get() == null ) {
|
||||
log10AlleleFrequencyPosteriors.set(new double[N+1]);
|
||||
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
|
||||
}
|
||||
|
||||
// estimate our confidence in a reference call and return
|
||||
if ( vc.getNSamples() == 0 )
|
||||
return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), false, 1.0);
|
||||
|
||||
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
|
||||
clearAFarray(log10AlleleFrequencyPosteriors.get());
|
||||
afcm.get().getLog10PNonRef(tracker, refContext, vc.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
|
||||
|
||||
// find the most likely frequency
|
||||
int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get());
|
||||
|
||||
// calculate p(f>0)
|
||||
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get());
|
||||
double sum = 0.0;
|
||||
for (int i = 1; i <= N; i++)
|
||||
sum += normalizedPosteriors[i];
|
||||
double PofF = Math.min(sum, 1.0); // deal with precision errors
|
||||
|
||||
double phredScaledConfidence;
|
||||
if ( bestAFguess != 0 ) {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
|
||||
if ( Double.isInfinite(phredScaledConfidence) )
|
||||
phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0];
|
||||
} else {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
|
||||
if ( Double.isInfinite(phredScaledConfidence) ) {
|
||||
sum = 0.0;
|
||||
for (int i = 1; i <= N; i++) {
|
||||
if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
|
||||
break;
|
||||
sum += log10AlleleFrequencyPosteriors.get()[i];
|
||||
}
|
||||
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
|
||||
}
|
||||
}
|
||||
|
||||
// did we trigger on the provided track?
|
||||
boolean atTriggerTrack = tracker.getReferenceMetaData(TRIGGER_TRACK_NAME, false).size() > 0;
|
||||
|
||||
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
|
||||
if ( !UAC.ALL_BASES_MODE && !passesEmitThreshold(phredScaledConfidence, bestAFguess, atTriggerTrack) ) {
|
||||
// technically, at this point our confidence in a reference call isn't accurately estimated
|
||||
// because it didn't take into account samples with no data, so let's get a better estimate
|
||||
return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), true, 1.0 - PofF);
|
||||
}
|
||||
|
||||
// create the genotypes
|
||||
Map<String, Genotype> genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
|
||||
|
||||
// print out stats if we have a writer
|
||||
if ( verboseWriter != null )
|
||||
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors);
|
||||
|
||||
// *** note that calculating strand bias involves overwriting data structures, so we do that last
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||
|
||||
DbSNPFeature dbsnp = getDbSNP(tracker);
|
||||
if ( dbsnp != null )
|
||||
attributes.put(VariantContext.ID_KEY, dbsnp.getRsID());
|
||||
|
||||
// if the site was downsampled, record that fact
|
||||
if ( rawContext.hasPileupBeenDownsampled() )
|
||||
attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);
|
||||
|
||||
|
||||
if ( !UAC.NO_SLOD && bestAFguess != 0 ) {
|
||||
final boolean DEBUG_SLOD = false;
|
||||
|
||||
// the overall lod
|
||||
//double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
|
||||
double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
|
||||
if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
|
||||
|
||||
// the forward lod
|
||||
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD);
|
||||
clearAFarray(log10AlleleFrequencyPosteriors.get());
|
||||
afcm.get().getLog10PNonRef(tracker, refContext, vcForward.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
|
||||
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
|
||||
double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
|
||||
double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
|
||||
if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
|
||||
|
||||
// the reverse lod
|
||||
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE);
|
||||
clearAFarray(log10AlleleFrequencyPosteriors.get());
|
||||
afcm.get().getLog10PNonRef(tracker, refContext, vcReverse.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
|
||||
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
|
||||
double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
|
||||
double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
|
||||
if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
|
||||
|
||||
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
|
||||
double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF;
|
||||
if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
|
||||
|
||||
// strand score is max bias between forward and reverse strands
|
||||
double strandScore = Math.max(forwardLod, reverseLod);
|
||||
// rescale by a factor of 10
|
||||
strandScore *= 10.0;
|
||||
//logger.debug(String.format("SLOD=%f", strandScore));
|
||||
|
||||
attributes.put("SB", Double.valueOf(strandScore));
|
||||
}
|
||||
|
||||
GenomeLoc loc = refContext.getLocus();
|
||||
|
||||
int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc);
|
||||
|
||||
VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc,
|
||||
vc.getAlleles(), genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes);
|
||||
|
||||
if ( annotationEngine != null ) {
|
||||
// first off, we want to use the *unfiltered* context for the annotations
|
||||
ReadBackedPileup pileup = null;
|
||||
if (rawContext.hasExtendedEventPileup())
|
||||
pileup = rawContext.getExtendedEventPileup();
|
||||
else if (rawContext.hasBasePileup())
|
||||
pileup = rawContext.getBasePileup();
|
||||
stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
|
||||
|
||||
Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall);
|
||||
vcCall = variantContexts.iterator().next(); // we know the collection will always have exactly 1 element.
|
||||
}
|
||||
|
||||
VariantCallContext call = new VariantCallContext(vcCall, passesCallThreshold(phredScaledConfidence, atTriggerTrack));
|
||||
call.setRefBase(refContext.getBase());
|
||||
return call;
|
||||
}
|
||||
|
||||
private int calculateEndPos(Set<Allele> alleles, Allele refAllele, GenomeLoc loc) {
|
||||
// TODO - temp fix until we can deal with extended events properly
|
||||
// for indels, stop location is one more than ref allele length
|
||||
boolean isSNP = true;
|
||||
for (Allele a : alleles){
|
||||
if (a.getBaseString().length() != 1) {
|
||||
isSNP = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int endLoc = loc.getStart();
|
||||
if ( !isSNP )
|
||||
endLoc += refAllele.length();
|
||||
|
||||
return endLoc;
|
||||
}
|
||||
|
||||
private static boolean isValidDeletionFraction(double d) {
|
||||
return ( d >= 0.0 && d <= 1.0 );
|
||||
}
|
||||
|
||||
private Map<String, StratifiedAlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
BadBaseFilter badReadPileupFilter = new BadBaseFilter(refContext, UAC);
|
||||
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = null;
|
||||
|
||||
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL && rawContext.hasExtendedEventPileup() ) {
|
||||
|
||||
ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup();
|
||||
|
||||
// filter the context based on min mapping quality
|
||||
ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE);
|
||||
|
||||
// don't call when there is no coverage
|
||||
if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE )
|
||||
return null;
|
||||
|
||||
// stratify the AlignmentContext and cut by sample
|
||||
stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
|
||||
|
||||
} else if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP && !rawContext.hasExtendedEventPileup() ) {
|
||||
|
||||
byte ref = refContext.getBase();
|
||||
if ( !BaseUtils.isRegularBase(ref) )
|
||||
return null;
|
||||
|
||||
// stratify the AlignmentContext and cut by sample
|
||||
stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE);
|
||||
|
||||
// filter the reads (and test for bad pileups)
|
||||
if ( !filterPileup(stratifiedContexts, badReadPileupFilter) )
|
||||
return null;
|
||||
}
|
||||
|
||||
return stratifiedContexts;
|
||||
}
|
||||
|
||||
protected static void clearAFarray(double[] AFs) {
|
||||
for ( int i = 0; i < AFs.length; i++ )
|
||||
AFs[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
|
||||
}
|
||||
|
||||
private VariantCallContext estimateReferenceConfidence(Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) {
|
||||
|
||||
double P_of_ref = initialPofRef;
|
||||
|
||||
// for each sample that we haven't examined yet
|
||||
for ( String sample : samples ) {
|
||||
boolean isCovered = contexts.containsKey(sample);
|
||||
if ( ignoreCoveredSamples && isCovered )
|
||||
continue;
|
||||
|
||||
|
||||
int depth = 0;
|
||||
|
||||
if (isCovered) {
|
||||
AlignmentContext context = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
||||
|
||||
if (context.hasBasePileup())
|
||||
depth = context.getBasePileup().size();
|
||||
else if (context.hasExtendedEventPileup())
|
||||
depth = context.getExtendedEventPileup().size();
|
||||
}
|
||||
|
||||
P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
|
||||
}
|
||||
|
||||
return new VariantCallContext(QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING);
|
||||
}
|
||||
|
||||
protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors) {
|
||||
Allele refAllele = null, altAllele = null;
|
||||
for ( Allele allele : vc.getAlleles() ) {
|
||||
if ( allele.isReference() )
|
||||
refAllele = allele;
|
||||
else
|
||||
altAllele = allele;
|
||||
}
|
||||
|
||||
for (int i = 0; i <= N; i++) {
|
||||
StringBuilder AFline = new StringBuilder("AFINFO\t");
|
||||
AFline.append(pos);
|
||||
AFline.append("\t");
|
||||
AFline.append(refAllele);
|
||||
AFline.append("\t");
|
||||
if ( altAllele != null )
|
||||
AFline.append(altAllele);
|
||||
else
|
||||
AFline.append("N/A");
|
||||
AFline.append("\t");
|
||||
AFline.append(i + "/" + N + "\t");
|
||||
AFline.append(String.format("%.2f\t", ((float)i)/N));
|
||||
AFline.append(String.format("%.8f\t", log10AlleleFrequencyPriors[i]));
|
||||
if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED)
|
||||
AFline.append("0.00000000\t");
|
||||
else
|
||||
AFline.append(String.format("%.8f\t", log10AlleleFrequencyPosteriors.get()[i]));
|
||||
AFline.append(String.format("%.8f\t", normalizedPosteriors[i]));
|
||||
verboseWriter.println(AFline.toString());
|
||||
}
|
||||
|
||||
verboseWriter.println("P(f>0) = " + PofF);
|
||||
verboseWriter.println("Qscore = " + phredScaledConfidence);
|
||||
verboseWriter.println();
|
||||
}
|
||||
|
||||
private boolean filterPileup(Map<String, StratifiedAlignmentContext> stratifiedContexts, BadBaseFilter badBaseFilter) {
|
||||
int numDeletions = 0, pileupSize = 0;
|
||||
|
||||
for ( StratifiedAlignmentContext context : stratifiedContexts.values() ) {
|
||||
ReadBackedPileup pileup = context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup();
|
||||
for ( PileupElement p : pileup ) {
|
||||
final SAMRecord read = p.getRead();
|
||||
|
||||
if ( p.isDeletion() ) {
|
||||
// if it's a good read, count it
|
||||
if ( read.getMappingQuality() >= UAC.MIN_MAPPING_QUALTY_SCORE &&
|
||||
(UAC.USE_BADLY_MATED_READS || !BadMateFilter.hasBadMate(read)) )
|
||||
numDeletions++;
|
||||
} else {
|
||||
if ( !(read instanceof GATKSAMRecord) )
|
||||
throw new ReviewedStingException("The UnifiedGenotyper currently expects GATKSAMRecords, but instead saw a " + read.getClass());
|
||||
GATKSAMRecord GATKrecord = (GATKSAMRecord)read;
|
||||
GATKrecord.setGoodBases(badBaseFilter, true);
|
||||
if ( GATKrecord.isGoodBase(p.getOffset()) )
|
||||
pileupSize++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// now, test for bad pileups
|
||||
|
||||
// in all_bases mode, it doesn't matter
|
||||
if ( UAC.ALL_BASES_MODE )
|
||||
return true;
|
||||
|
||||
// is there no coverage?
|
||||
if ( pileupSize == 0 )
|
||||
return false;
|
||||
|
||||
// are there too many deletions in the pileup?
|
||||
if ( isValidDeletionFraction(UAC.MAX_DELETION_FRACTION) &&
|
||||
(double)numDeletions / (double)(pileupSize + numDeletions) > UAC.MAX_DELETION_FRACTION )
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Filters low quality reads out of the pileup.
|
||||
* Filters low quality bases out of the SAMRecords.
|
||||
*/
|
||||
private class BadReadPileupFilter implements PileupElementFilter {
|
||||
private class BadBaseFilter implements GATKSAMRecordFilter {
|
||||
private ReferenceContext refContext;
|
||||
private final UnifiedArgumentCollection UAC;
|
||||
|
||||
public BadReadPileupFilter(ReferenceContext refContext) { this.refContext = refContext; }
|
||||
public BadBaseFilter(ReferenceContext refContext, UnifiedArgumentCollection UAC) {
|
||||
this.refContext = refContext;
|
||||
this.UAC = UAC;
|
||||
}
|
||||
|
||||
public boolean allow(PileupElement pileupElement) {
|
||||
return ((UAC.USE_BADLY_MATED_READS || !BadMateFilter.hasBadMate(pileupElement.getRead())) &&
|
||||
AlignmentUtils.mismatchesInRefWindow(pileupElement, refContext, true) <= UAC.MAX_MISMATCHES );
|
||||
public BitSet getGoodBases(final GATKSAMRecord record) {
|
||||
// all bits are set to false by default
|
||||
BitSet bitset = new BitSet(record.getReadLength());
|
||||
|
||||
// if the mapping quality is too low or the mate is bad, we can just zero out the whole read and continue
|
||||
if ( record.getMappingQuality() < UAC.MIN_MAPPING_QUALTY_SCORE ||
|
||||
(!UAC.USE_BADLY_MATED_READS && BadMateFilter.hasBadMate(record)) ) {
|
||||
return bitset;
|
||||
}
|
||||
|
||||
byte[] quals = record.getBaseQualities();
|
||||
for (int i = 0; i < quals.length; i++) {
|
||||
if ( quals[i] >= UAC.MIN_BASE_QUALTY_SCORE )
|
||||
bitset.set(i);
|
||||
}
|
||||
|
||||
// if a read is too long for the reference context, extend the context (being sure not to extend past the end of the chromosome)
|
||||
if ( record.getAlignmentEnd() > refContext.getWindow().getStop() ) {
|
||||
GenomeLoc window = refContext.getGenomeLocParser().createGenomeLoc(refContext.getLocus().getContig(), refContext.getWindow().getStart(), Math.min(record.getAlignmentEnd(), referenceReader.getSequenceDictionary().getSequence(refContext.getLocus().getContig()).getSequenceLength()));
|
||||
byte[] bases = referenceReader.getSubsequenceAt(window.getContig(), window.getStart(), window.getStop()).getBases();
|
||||
StringUtil.toUpperCase(bases);
|
||||
refContext = new ReferenceContext(refContext.getGenomeLocParser(),refContext.getLocus(), window, bases);
|
||||
}
|
||||
|
||||
BitSet mismatches = AlignmentUtils.mismatchesInRefWindow(record, refContext, UAC.MAX_MISMATCHES, MISMATCH_WINDOW_SIZE);
|
||||
if ( mismatches != null )
|
||||
bitset.and(mismatches);
|
||||
|
||||
return bitset;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param tracker rod data
|
||||
*
|
||||
* @return the dbsnp rod if there is one at this position
|
||||
*/
|
||||
protected static DbSNPFeature getDbSNP(RefMetaDataTracker tracker) {
|
||||
return DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
|
||||
}
|
||||
|
||||
protected boolean passesEmitThreshold(double conf, int bestAFguess, boolean atTriggerTrack) {
|
||||
return (atTriggerTrack ?
|
||||
(conf >= Math.min(UAC.TRIGGER_CONFIDENCE_FOR_CALLING, UAC.TRIGGER_CONFIDENCE_FOR_EMITTING)) :
|
||||
((UAC.GENOTYPE_MODE || bestAFguess != 0) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING)));
|
||||
}
|
||||
|
||||
protected boolean passesCallThreshold(double conf, boolean atTriggerTrack) {
|
||||
return (atTriggerTrack ?
|
||||
(conf >= UAC.TRIGGER_CONFIDENCE_FOR_CALLING) :
|
||||
(conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING));
|
||||
}
|
||||
|
||||
protected void computeAlleleFrequencyPriors(int N) {
|
||||
// calculate the allele frequency priors for 1-N
|
||||
double sum = 0.0;
|
||||
for (int i = 1; i <= N; i++) {
|
||||
double value = UAC.heterozygosity / (double)i;
|
||||
log10AlleleFrequencyPriors[i] = Math.log10(value);
|
||||
sum += value;
|
||||
}
|
||||
|
||||
// null frequency for AF=0 is (1 - sum(all other frequencies))
|
||||
log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum);
|
||||
}
|
||||
|
||||
private static GenotypePriors createGenotypePriors(UnifiedArgumentCollection UAC) {
|
||||
GenotypePriors priors;
|
||||
switch ( UAC.GLmodel ) {
|
||||
case SNP:
|
||||
// use flat priors for GLs
|
||||
priors = new DiploidSNPGenotypePriors();
|
||||
break;
|
||||
case DINDEL:
|
||||
// create flat priors for Indels, actual priors will depend on event length to be genotyped
|
||||
priors = new DiploidIndelGenotypePriors();
|
||||
break;
|
||||
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
|
||||
}
|
||||
|
||||
return priors;
|
||||
}
|
||||
|
||||
private static GenotypeLikelihoodsCalculationModel getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) {
|
||||
GenotypeLikelihoodsCalculationModel glcm;
|
||||
switch ( UAC.GLmodel ) {
|
||||
case SNP:
|
||||
glcm = new SNPGenotypeLikelihoodsCalculationModel(UAC, logger);
|
||||
break;
|
||||
case DINDEL:
|
||||
glcm = new DindelGenotypeLikelihoodsCalculationModel(UAC, logger);
|
||||
break;
|
||||
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
|
||||
}
|
||||
|
||||
return glcm;
|
||||
}
|
||||
|
||||
private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
|
||||
AlleleFrequencyCalculationModel afcm;
|
||||
switch ( UAC.AFmodel ) {
|
||||
case EXACT:
|
||||
afcm = new ExactAFCalculationModel(N, logger, verboseWriter);
|
||||
break;
|
||||
case GRID_SEARCH:
|
||||
afcm = new GridSearchAFEstimation(N, logger, verboseWriter);
|
||||
break;
|
||||
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
|
||||
}
|
||||
|
||||
return afcm;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,3 +1,28 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
|
|
|
|||
|
|
@ -389,7 +389,7 @@ public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker<Set<Bas
|
|||
public boolean baseIsConfidentRef( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
if ( !BaseUtils.isRegularBase(ref.getBase()) )
|
||||
return false;
|
||||
VariantCallContext calls = ug.runGenotyper(tracker,ref,context);
|
||||
VariantCallContext calls = ug.calculateLikelihoodsAndGenotypes(tracker,ref,context);
|
||||
if ( calls == null || calls.vc == null)
|
||||
return false;
|
||||
return ( calls.vc.getNSamples() > 0 && calls.vc.getGenotype(0).isHomRef() && calls.vc.getGenotype(0).getNegLog10PError() > confidentRefThreshold );
|
||||
|
|
|
|||
|
|
@ -559,7 +559,7 @@ public class MendelianViolationClassifier extends LocusWalker<MendelianViolation
|
|||
int conf = 0;
|
||||
Allele alt = null;
|
||||
for ( Map.Entry<String,StratifiedAlignmentContext> sEntry : strat.entrySet() ) {
|
||||
VariantCallContext call = engine.runGenotyper(tracker, ref, sEntry.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE));
|
||||
VariantCallContext call = engine.calculateLikelihoodsAndGenotypes(tracker, ref, sEntry.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE));
|
||||
if ( call != null && call.confidentlyCalled && call.vc != null ) {
|
||||
if ( call.vc.isSNP() ) {
|
||||
if ( ! call.vc.getAlternateAllele(0).basesMatch(var.getAlternateAllele(0))) {
|
||||
|
|
|
|||
|
|
@ -38,7 +38,6 @@ import org.broadinstitute.sting.commandline.Argument;
|
|||
import org.broadinstitute.sting.commandline.Output;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Hashtable;
|
||||
import java.util.List;
|
||||
import java.io.PrintStream;
|
||||
|
||||
|
|
@ -115,7 +114,7 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
|
|||
//}
|
||||
|
||||
//Calculate posterior probabilities
|
||||
GenotypeLikelihoods G = new GenotypeLikelihoods(BaseMismatchModel.THREE_STATE);
|
||||
DiploidSNPGenotypeLikelihoods G = new DiploidSNPGenotypeLikelihoods(BaseMismatchModel.THREE_STATE);
|
||||
SAMRecord read; int offset; char base; byte qual; int mapquality; String readname;
|
||||
|
||||
//Check for bad bases and ensure mapping quality
|
||||
|
|
|
|||
|
|
@ -353,7 +353,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
}
|
||||
|
||||
//Calculate posterior probabilities
|
||||
GenotypeLikelihoods G = new GenotypeLikelihoods(BaseMismatchModel.THREE_STATE);
|
||||
DiploidSNPGenotypeLikelihoods G = new DiploidSNPGenotypeLikelihoods(BaseMismatchModel.THREE_STATE);
|
||||
SAMRecord read; int offset; char base; byte qual; int mapquality; String readname;
|
||||
|
||||
//Check for bad bases and ensure mapping quality myself. This works.
|
||||
|
|
|
|||
|
|
@ -200,7 +200,7 @@ public class LocusMismatchWalker extends LocusWalker<String,Integer> implements
|
|||
}
|
||||
|
||||
private Genotype getGenotype( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
VariantCallContext calls = ug.runGenotyper(tracker,ref,context);
|
||||
VariantCallContext calls = ug.calculateLikelihoodsAndGenotypes(tracker,ref,context);
|
||||
if ( calls == null || calls.vc == null || calls.vc.getNSamples() == 0 || !calls.vc.isSNP() )
|
||||
return null;
|
||||
else {
|
||||
|
|
|
|||
|
|
@ -118,7 +118,7 @@ public class SnpCallRateByCoverageWalker extends LocusWalker<List<String>, Strin
|
|||
|
||||
AlignmentContext subContext = new AlignmentContext(context.getLocation(), new ReadBackedPileupImpl(context.getLocation(),sub_reads, sub_offsets));
|
||||
|
||||
VariantCallContext calls = UG.runGenotyper(tracker, ref, subContext);
|
||||
VariantCallContext calls = UG.calculateLikelihoodsAndGenotypes(tracker, ref, subContext);
|
||||
|
||||
if (calls != null && calls.vc != null && calls.vc.getNSamples() > 0 && calls.confidentlyCalled) {
|
||||
Genotype evCall = calls.vc.getGenotype(0);
|
||||
|
|
|
|||
|
|
@ -177,7 +177,7 @@ public class ValidationGenotyper extends LocusWalker<ValidationGenotyper.Counted
|
|||
}
|
||||
}
|
||||
else if( evalStatus == VARIANT_STATUS.CALLED && compStatus == VARIANT_STATUS.MISSING ) {
|
||||
VariantCallContext call = engine.runGenotyper(tracker, ref, context);
|
||||
VariantCallContext call = engine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
|
||||
if( call != null && call.confidentlyCalled && call.vc != null && call.vc.getType() == VariantContext.Type.NO_VARIATION ) {
|
||||
counter.numFP = 1L;
|
||||
if( printStream!= null ) {
|
||||
|
|
|
|||
|
|
@ -107,7 +107,7 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
|
|||
double altBalance = ((double) altCount)/((double) totalCount);
|
||||
|
||||
if (altBalance > 0.70) {
|
||||
VariantCallContext ugResult = ug.runGenotyper(tracker, ref, context);
|
||||
VariantCallContext ugResult = ug.calculateLikelihoodsAndGenotypes(tracker, ref, context);
|
||||
|
||||
if (ugResult != null && ugResult.vc != null && ugResult.vc.getNSamples() > 0) {
|
||||
return ugResult.vc.getGenotype(0).isHet();
|
||||
|
|
|
|||
|
|
@ -1,32 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
public enum BaseMismatchModel {
|
||||
ONE_STATE,
|
||||
THREE_STATE,
|
||||
EMPIRICAL
|
||||
}
|
||||
|
|
@ -1,301 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import java.util.TreeMap;
|
||||
import java.util.EnumMap;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
|
||||
public class EmpiricalSubstitutionProbabilities extends FourBaseLikelihoods {
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Static methods to manipulate machine platforms
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
public enum SequencerPlatform {
|
||||
SOLEXA, // Solexa / Illumina
|
||||
ROCHE454, // 454
|
||||
SOLID, // SOLiD
|
||||
CG, // Complete Genomics
|
||||
UNKNOWN // No idea -- defaulting to 1/3
|
||||
}
|
||||
|
||||
private static TreeMap<String, SequencerPlatform> PLFieldToSequencerPlatform = new TreeMap<String, SequencerPlatform>();
|
||||
private static void bind(String s, SequencerPlatform x) {
|
||||
PLFieldToSequencerPlatform.put(s, x);
|
||||
PLFieldToSequencerPlatform.put(s.toUpperCase(), x);
|
||||
PLFieldToSequencerPlatform.put(s.toLowerCase(), x);
|
||||
}
|
||||
|
||||
//
|
||||
// Static list of platforms supported by this system.
|
||||
//
|
||||
static {
|
||||
bind("LS454", SequencerPlatform.ROCHE454);
|
||||
bind("454", SequencerPlatform.ROCHE454);
|
||||
bind("ILLUMINA", SequencerPlatform.SOLEXA);
|
||||
bind("SOLEXA", SequencerPlatform.SOLEXA);
|
||||
bind("SOLID", SequencerPlatform.SOLID);
|
||||
bind("ABI_SOLID", SequencerPlatform.SOLID);
|
||||
bind("CG", SequencerPlatform.CG);
|
||||
}
|
||||
|
||||
public static SequencerPlatform standardizeSequencerPlatform( final String sequencerString ) {
|
||||
String lcSequencerString = sequencerString == null ? null : sequencerString.toLowerCase();
|
||||
if ( sequencerString != null && PLFieldToSequencerPlatform.containsKey(lcSequencerString) ) {
|
||||
return PLFieldToSequencerPlatform.get(lcSequencerString);
|
||||
} else {
|
||||
return SequencerPlatform.UNKNOWN;
|
||||
}
|
||||
}
|
||||
|
||||
private static ThreadLocal<SAMRecord> lastReadForPL = new ThreadLocal<SAMRecord>();
|
||||
private static ThreadLocal<SequencerPlatform> plOfLastRead = new ThreadLocal<SequencerPlatform>();
|
||||
public static SequencerPlatform getReadSequencerPlatform( SAMRecord read ) {
|
||||
if ( lastReadForPL.get() != read ) {
|
||||
lastReadForPL.set(read);
|
||||
SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
final String platformName = readGroup == null ? null : readGroup.getPlatform();
|
||||
plOfLastRead.set(standardizeSequencerPlatform(platformName));
|
||||
}
|
||||
|
||||
return plOfLastRead.get();
|
||||
}
|
||||
|
||||
public int getReadSequencerPlatformIndex( SAMRecord read ) {
|
||||
return getReadSequencerPlatform(read).ordinal();
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Static methods to get at the transition tables themselves
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* A matrix of value i x j -> log10(p) where
|
||||
*
|
||||
* i - byte of the miscalled base (i.e., A)
|
||||
* j - byte of the presumed true base (i.e., C)
|
||||
* log10p - empirical probability p that A is actually C
|
||||
*
|
||||
* The table is available for each technology
|
||||
*/
|
||||
private final static EnumMap<SequencerPlatform, double[][]> log10pTrueGivenMiscall = new EnumMap<SequencerPlatform, double[][]>(SequencerPlatform.class);
|
||||
|
||||
private static void addMisCall(final SequencerPlatform pl, byte miscalledBase, byte trueBase, double p) {
|
||||
if ( ! log10pTrueGivenMiscall.containsKey(pl) )
|
||||
log10pTrueGivenMiscall.put(pl, new double[4][4]);
|
||||
|
||||
double[][] misCallProbs = log10pTrueGivenMiscall.get(pl);
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase);
|
||||
int j = BaseUtils.simpleBaseToBaseIndex(trueBase);
|
||||
misCallProbs[i][j] = log10(p);
|
||||
}
|
||||
|
||||
private static double getProbMiscallIsBase(SequencerPlatform pl, byte miscalledBase, byte trueBase) {
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase);
|
||||
int j = BaseUtils.simpleBaseToBaseIndex(trueBase);
|
||||
|
||||
double logP = log10pTrueGivenMiscall.get(pl)[i][j];
|
||||
if ( logP == 0.0 )
|
||||
throw new RuntimeException(String.format("Bad miscall base request miscalled=%c true=%c", miscalledBase, trueBase));
|
||||
else
|
||||
return logP;
|
||||
}
|
||||
|
||||
private static void addSolexa() {
|
||||
SequencerPlatform pl = SequencerPlatform.SOLEXA;
|
||||
addMisCall(pl, BaseUtils.A, BaseUtils.C, 57.7/100.0);
|
||||
addMisCall(pl, BaseUtils.A, BaseUtils.G, 17.1/100.0);
|
||||
addMisCall(pl, BaseUtils.A, BaseUtils.T, 25.2/100.0);
|
||||
|
||||
addMisCall(pl, BaseUtils.C, BaseUtils.A, 34.9/100.0);
|
||||
addMisCall(pl, BaseUtils.C, BaseUtils.G, 11.3/100.0);
|
||||
addMisCall(pl, BaseUtils.C, BaseUtils.T, 53.9/100.0);
|
||||
|
||||
addMisCall(pl, BaseUtils.G, BaseUtils.A, 31.9/100.0);
|
||||
addMisCall(pl, BaseUtils.G, BaseUtils.C, 5.1/100.0);
|
||||
addMisCall(pl, BaseUtils.G, BaseUtils.T, 63.0/100.0);
|
||||
|
||||
addMisCall(pl, BaseUtils.T, BaseUtils.A, 45.8/100.0);
|
||||
addMisCall(pl, BaseUtils.T, BaseUtils.C, 22.1/100.0);
|
||||
addMisCall(pl, BaseUtils.T, BaseUtils.G, 32.0/100.0);
|
||||
}
|
||||
|
||||
private static void addSOLiD() {
|
||||
SequencerPlatform pl = SequencerPlatform.SOLID;
|
||||
addMisCall(pl, BaseUtils.A, BaseUtils.C, 18.7/100.0);
|
||||
addMisCall(pl, BaseUtils.A, BaseUtils.G, 42.5/100.0);
|
||||
addMisCall(pl, BaseUtils.A, BaseUtils.T, 38.7/100.0);
|
||||
|
||||
addMisCall(pl, BaseUtils.C, BaseUtils.A, 27.0/100.0);
|
||||
addMisCall(pl, BaseUtils.C, BaseUtils.G, 18.9/100.0);
|
||||
addMisCall(pl, BaseUtils.C, BaseUtils.T, 54.1/100.0);
|
||||
|
||||
addMisCall(pl, BaseUtils.G, BaseUtils.A, 61.0/100.0);
|
||||
addMisCall(pl, BaseUtils.G, BaseUtils.C, 15.7/100.0);
|
||||
addMisCall(pl, BaseUtils.G, BaseUtils.T, 23.2/100.0);
|
||||
|
||||
addMisCall(pl, BaseUtils.T, BaseUtils.A, 40.5/100.0);
|
||||
addMisCall(pl, BaseUtils.T, BaseUtils.C, 34.3/100.0);
|
||||
addMisCall(pl, BaseUtils.T, BaseUtils.G, 25.2/100.0);
|
||||
}
|
||||
|
||||
private static void add454() {
|
||||
SequencerPlatform pl = SequencerPlatform.ROCHE454;
|
||||
addMisCall(pl, BaseUtils.A, BaseUtils.C, 23.2/100.0);
|
||||
addMisCall(pl, BaseUtils.A, BaseUtils.G, 42.6/100.0);
|
||||
addMisCall(pl, BaseUtils.A, BaseUtils.T, 34.3/100.0);
|
||||
|
||||
addMisCall(pl, BaseUtils.C, BaseUtils.A, 19.7/100.0);
|
||||
addMisCall(pl, BaseUtils.C, BaseUtils.G, 8.4/100.0);
|
||||
addMisCall(pl, BaseUtils.C, BaseUtils.T, 71.9/100.0);
|
||||
|
||||
addMisCall(pl, BaseUtils.G, BaseUtils.A, 71.5/100.0);
|
||||
addMisCall(pl, BaseUtils.G, BaseUtils.C, 6.6/100.0);
|
||||
addMisCall(pl, BaseUtils.G, BaseUtils.T, 21.9/100.0);
|
||||
|
||||
addMisCall(pl, BaseUtils.T, BaseUtils.A, 43.8/100.0);
|
||||
addMisCall(pl, BaseUtils.T, BaseUtils.C, 37.8/100.0);
|
||||
addMisCall(pl, BaseUtils.T, BaseUtils.G, 18.5/100.0);
|
||||
}
|
||||
|
||||
private static void addCG() {
|
||||
SequencerPlatform pl = SequencerPlatform.CG;
|
||||
addMisCall(pl, BaseUtils.A, BaseUtils.C, 28.2/100.0);
|
||||
addMisCall(pl, BaseUtils.A, BaseUtils.G, 28.7/100.0);
|
||||
addMisCall(pl, BaseUtils.A, BaseUtils.T, 43.1/100.0);
|
||||
|
||||
addMisCall(pl, BaseUtils.C, BaseUtils.A, 29.8/100.0);
|
||||
addMisCall(pl, BaseUtils.C, BaseUtils.G, 18.6/100.0);
|
||||
addMisCall(pl, BaseUtils.C, BaseUtils.T, 51.6/100.0);
|
||||
|
||||
addMisCall(pl, BaseUtils.G, BaseUtils.A, 32.5/100.0);
|
||||
addMisCall(pl, BaseUtils.G, BaseUtils.C, 21.4/100.0);
|
||||
addMisCall(pl, BaseUtils.G, BaseUtils.T, 46.1/100.0);
|
||||
|
||||
addMisCall(pl, BaseUtils.T, BaseUtils.A, 42.6/100.0);
|
||||
addMisCall(pl, BaseUtils.T, BaseUtils.C, 34.1/100.0);
|
||||
addMisCall(pl, BaseUtils.T, BaseUtils.G, 23.3/100.0);
|
||||
}
|
||||
|
||||
private static void addUnknown() {
|
||||
SequencerPlatform pl = SequencerPlatform.UNKNOWN;
|
||||
for ( byte b1 : BaseUtils.BASES ) {
|
||||
for ( byte b2 : BaseUtils.BASES ) {
|
||||
if ( b1 != b2 )
|
||||
addMisCall(pl, b1, b2, 1.0/3.0);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
static {
|
||||
addSolexa();
|
||||
add454();
|
||||
addSOLiD();
|
||||
addCG();
|
||||
addUnknown();
|
||||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// The actual objects themselves
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
private boolean raiseErrorOnUnknownPlatform = true;
|
||||
private SequencerPlatform defaultPlatform = SequencerPlatform.UNKNOWN;
|
||||
|
||||
//
|
||||
// forwarding constructors -- don't do anything at all
|
||||
//
|
||||
public EmpiricalSubstitutionProbabilities() { super(); }
|
||||
|
||||
public EmpiricalSubstitutionProbabilities(boolean raiseErrorOnUnknownPlatform) {
|
||||
super();
|
||||
this.raiseErrorOnUnknownPlatform = raiseErrorOnUnknownPlatform;
|
||||
}
|
||||
|
||||
public EmpiricalSubstitutionProbabilities(SequencerPlatform assumeUnknownPlatformsAreThis) {
|
||||
super();
|
||||
|
||||
if ( assumeUnknownPlatformsAreThis != null ) {
|
||||
raiseErrorOnUnknownPlatform = false;
|
||||
defaultPlatform = assumeUnknownPlatformsAreThis;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Cloning of the object
|
||||
* @return clone
|
||||
* @throws CloneNotSupportedException
|
||||
*/
|
||||
protected Object clone() throws CloneNotSupportedException {
|
||||
return super.clone();
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// calculation of p(B|GT)
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
|
||||
boolean fwdStrand = ! read.getReadNegativeStrandFlag();
|
||||
SequencerPlatform pl = getReadSequencerPlatform(read);
|
||||
|
||||
if ( pl == SequencerPlatform.UNKNOWN ) {
|
||||
if ( raiseErrorOnUnknownPlatform )
|
||||
throw new RuntimeException("Unknown sequencer platform for read " + read.format() + "; your BAM file is either missing the PL tag for some read groups or an unsupported platform is being used.");
|
||||
else {
|
||||
pl = defaultPlatform;
|
||||
//System.out.printf("Using default platform %s", pl);
|
||||
}
|
||||
}
|
||||
|
||||
//System.out.printf("%s for %s%n", pl, read);
|
||||
|
||||
double log10p;
|
||||
if ( fwdStrand ) {
|
||||
log10p = getProbMiscallIsBase(pl, observedBase, chromBase);
|
||||
} else {
|
||||
log10p = getProbMiscallIsBase(pl, BaseUtils.simpleComplement(observedBase), BaseUtils.simpleComplement(chromBase));
|
||||
}
|
||||
|
||||
//System.out.printf("p = %f for %s %c %c fwd=%b %d at %s%n", pow(10,log10p), pl, observedBase, chromBase, fwdStrand, offset, read.getReadName() );
|
||||
|
||||
return log10p;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,60 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* This implements the old style CRD calculation of the chance that a base being a true chromBase given
|
||||
* an miscalled base, in which the p is e, grabbing all of the probability. It shouldn't be used
|
||||
*/
|
||||
public class OneStateErrorProbabilities extends FourBaseLikelihoods {
|
||||
//
|
||||
// forwarding constructors -- don't do anything at all
|
||||
//
|
||||
public OneStateErrorProbabilities() { super(); }
|
||||
|
||||
/**
|
||||
* Cloning of the object
|
||||
* @return clone
|
||||
* @throws CloneNotSupportedException
|
||||
*/
|
||||
protected Object clone() throws CloneNotSupportedException {
|
||||
return super.clone();
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @param chromBase target base
|
||||
* @param read SAM read
|
||||
* @param offset offset on read
|
||||
* @return log10 likelihood
|
||||
*/
|
||||
protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
|
||||
return 0; // equivalent to e model
|
||||
}
|
||||
}
|
||||
|
|
@ -1,63 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
public class ThreeStateErrorProbabilities extends FourBaseLikelihoods {
|
||||
//
|
||||
// forwarding constructors -- don't do anything at all
|
||||
//
|
||||
public ThreeStateErrorProbabilities() { super(); }
|
||||
|
||||
/**
|
||||
* Cloning of the object
|
||||
* @return clone
|
||||
* @throws CloneNotSupportedException
|
||||
*/
|
||||
protected Object clone() throws CloneNotSupportedException {
|
||||
return super.clone();
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple log10(3) cached value
|
||||
*/
|
||||
protected static final double log103 = log10(3.0);
|
||||
|
||||
/**
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @param chromBase target base
|
||||
* @param read SAM read
|
||||
* @param offset offset on read
|
||||
* @return log10 likelihood
|
||||
*/
|
||||
protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
|
||||
return -log103; // equivalent to e / 3 model
|
||||
}
|
||||
}
|
||||
|
|
@ -1,122 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
|
||||
|
||||
public class UnifiedArgumentCollection {
|
||||
|
||||
// control the various models to be used
|
||||
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while DINDEL is also available for calling indels.", required = false)
|
||||
public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
|
||||
|
||||
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false)
|
||||
public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
|
||||
|
||||
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ when using the SNP Genotype Likelihoods model -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
|
||||
public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL;
|
||||
|
||||
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
|
||||
public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY;
|
||||
|
||||
// control the output
|
||||
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false)
|
||||
public boolean GENOTYPE_MODE = false;
|
||||
|
||||
@Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output all callable bases?", required = false)
|
||||
public boolean ALL_BASES_MODE = false;
|
||||
|
||||
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false)
|
||||
public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
|
||||
|
||||
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
|
||||
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
|
||||
|
||||
@Argument(fullName = "trigger_min_confidence_threshold_for_calling", shortName = "trig_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be called", required = false)
|
||||
public double TRIGGER_CONFIDENCE_FOR_CALLING = 30.0;
|
||||
|
||||
@Argument(fullName = "trigger_min_confidence_threshold_for_emitting", shortName = "trig_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
|
||||
public double TRIGGER_CONFIDENCE_FOR_EMITTING = 30.0;
|
||||
|
||||
@Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false)
|
||||
public boolean NO_SLOD = false;
|
||||
|
||||
|
||||
// control the error modes
|
||||
@Hidden
|
||||
@Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
|
||||
public String ASSUME_SINGLE_SAMPLE = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
|
||||
public EmpiricalSubstitutionProbabilities.SequencerPlatform defaultPlatform = null;
|
||||
|
||||
|
||||
// control the various parameters to be used
|
||||
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
|
||||
public int MIN_BASE_QUALTY_SCORE = 17;
|
||||
|
||||
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false)
|
||||
public int MIN_MAPPING_QUALTY_SCORE = 20;
|
||||
|
||||
@Argument(fullName = "max_mismatches_in_40bp_window", shortName = "mm40", doc = "Maximum number of mismatches within a 40 bp window (20bp on either side) around the target position for a read to be used for calling", required = false)
|
||||
public int MAX_MISMATCHES = 3;
|
||||
|
||||
@Argument(fullName = "use_reads_with_bad_mates", shortName = "bad_mates", doc = "Use reads whose mates are mapped excessively far away for calling", required = false)
|
||||
public boolean USE_BADLY_MATED_READS = false;
|
||||
|
||||
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
||||
public Double MAX_DELETION_FRACTION = 0.05;
|
||||
|
||||
|
||||
public UnifiedArgumentCollection clone() {
|
||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||
|
||||
uac.GLmodel = GLmodel;
|
||||
uac.baseModel = baseModel;
|
||||
uac.heterozygosity = heterozygosity;
|
||||
uac.GENOTYPE_MODE = GENOTYPE_MODE;
|
||||
uac.ALL_BASES_MODE = ALL_BASES_MODE;
|
||||
uac.NO_SLOD = NO_SLOD;
|
||||
uac.ASSUME_SINGLE_SAMPLE = ASSUME_SINGLE_SAMPLE;
|
||||
uac.defaultPlatform = defaultPlatform;
|
||||
uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING;
|
||||
uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||
uac.TRIGGER_CONFIDENCE_FOR_CALLING = TRIGGER_CONFIDENCE_FOR_CALLING;
|
||||
uac.TRIGGER_CONFIDENCE_FOR_EMITTING = TRIGGER_CONFIDENCE_FOR_EMITTING;
|
||||
uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE;
|
||||
uac.MIN_MAPPING_QUALTY_SCORE = MIN_MAPPING_QUALTY_SCORE;
|
||||
uac.MAX_MISMATCHES = MAX_MISMATCHES;
|
||||
uac.USE_BADLY_MATED_READS = USE_BADLY_MATED_READS;
|
||||
uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION;
|
||||
|
||||
return uac;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -1,681 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.pileup.*;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecordFilter;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broad.tribble.vcf.VCFConstants;
|
||||
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
|
||||
|
||||
public class UnifiedGenotyperEngine {
|
||||
|
||||
public static final String TRIGGER_TRACK_NAME = "trigger";
|
||||
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
|
||||
|
||||
// the unified argument collection
|
||||
private UnifiedArgumentCollection UAC = null;
|
||||
|
||||
// the annotation engine
|
||||
private VariantAnnotatorEngine annotationEngine;
|
||||
|
||||
// the model used for calculating genotypes
|
||||
private ThreadLocal<GenotypeLikelihoodsCalculationModel> glcm = new ThreadLocal<GenotypeLikelihoodsCalculationModel>();
|
||||
|
||||
// the model used for calculating p(non-ref)
|
||||
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
|
||||
|
||||
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
|
||||
private double[] log10AlleleFrequencyPriors;
|
||||
|
||||
// the allele frequency likelihoods (allocated once as an optimization)
|
||||
private ThreadLocal<double[]> log10AlleleFrequencyPosteriors = new ThreadLocal<double[]>();
|
||||
|
||||
// the priors object
|
||||
private GenotypePriors genotypePriors;
|
||||
|
||||
// samples in input
|
||||
private Set<String> samples = new TreeSet<String>();
|
||||
|
||||
// the various loggers and writers
|
||||
private Logger logger = null;
|
||||
private PrintStream verboseWriter = null;
|
||||
|
||||
// fasta reference reader to supplement the edges of the reference sequence for long reads
|
||||
private IndexedFastaSequenceFile referenceReader;
|
||||
|
||||
// number of chromosomes (2 * samples) in input
|
||||
private int N;
|
||||
|
||||
// the standard filter to use for calls below the confidence threshold but above the emit threshold
|
||||
private static final Set<String> filter = new HashSet<String>(1);
|
||||
|
||||
private static final int MISMATCH_WINDOW_SIZE = 20;
|
||||
|
||||
|
||||
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
|
||||
// get the number of samples
|
||||
// if we're supposed to assume a single sample, do so
|
||||
samples = new TreeSet<String>();
|
||||
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
|
||||
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
|
||||
else
|
||||
samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader());
|
||||
|
||||
initialize(toolkit, UAC, null, null, null, samples.size());
|
||||
}
|
||||
|
||||
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set<String> samples) {
|
||||
this.samples = new TreeSet<String>(samples);
|
||||
initialize(toolkit, UAC, logger, verboseWriter, engine, samples.size());
|
||||
}
|
||||
|
||||
private void initialize(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
|
||||
this.UAC = UAC;
|
||||
this.logger = logger;
|
||||
this.verboseWriter = verboseWriter;
|
||||
this.annotationEngine = engine;
|
||||
|
||||
N = 2 * numSamples;
|
||||
log10AlleleFrequencyPriors = new double[N+1];
|
||||
computeAlleleFrequencyPriors(N);
|
||||
genotypePriors = createGenotypePriors(UAC);
|
||||
|
||||
filter.add(LOW_QUAL_FILTER_NAME);
|
||||
|
||||
referenceReader = new IndexedFastaSequenceFile(toolkit.getArguments().referenceFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute full calls at a given locus.
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @return the VariantCallContext object
|
||||
*/
|
||||
public VariantCallContext calculateLikelihoodsAndGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
|
||||
VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
||||
if ( vc == null )
|
||||
return null;
|
||||
|
||||
VariantCallContext vcc = calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc);
|
||||
vcc.vc = GLsToPLs(vcc.vc);
|
||||
return vcc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute GLs at a given locus.
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @return the VariantContext object
|
||||
*/
|
||||
public VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
|
||||
VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
||||
return GLsToPLs(vc);
|
||||
}
|
||||
|
||||
private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map<String, StratifiedAlignmentContext> stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType type) {
|
||||
if ( stratifiedContexts == null )
|
||||
return null;
|
||||
|
||||
// initialize the data for this thread if that hasn't been done yet
|
||||
if ( glcm.get() == null ) {
|
||||
glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
|
||||
}
|
||||
|
||||
Map<String, BiallelicGenotypeLikelihoods> GLs = new HashMap<String, BiallelicGenotypeLikelihoods>();
|
||||
Allele refAllele = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, type, genotypePriors, GLs);
|
||||
|
||||
return createVariantContextFromLikelihoods(refContext, refAllele, GLs);
|
||||
}
|
||||
|
||||
private VariantContext createVariantContextFromLikelihoods(ReferenceContext refContext, Allele refAllele, Map<String, BiallelicGenotypeLikelihoods> GLs) {
|
||||
// no-call everyone for now
|
||||
List<Allele> noCall = new ArrayList<Allele>();
|
||||
noCall.add(Allele.NO_CALL);
|
||||
|
||||
Set<Allele> alleles = new HashSet<Allele>();
|
||||
alleles.add(refAllele);
|
||||
boolean addedAltAllele = false;
|
||||
|
||||
HashMap<String, Genotype> genotypes = new HashMap<String, Genotype>();
|
||||
for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) {
|
||||
if ( !addedAltAllele ) {
|
||||
addedAltAllele = true;
|
||||
alleles.add(GL.getAlleleA());
|
||||
alleles.add(GL.getAlleleB());
|
||||
}
|
||||
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods(), VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
|
||||
attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth());
|
||||
attributes.put(likelihoods.getKey(), likelihoods.getAsString());
|
||||
|
||||
genotypes.put(GL.getSample(), new Genotype(GL.getSample(), noCall, Genotype.NO_NEG_LOG_10PERROR, null, attributes, false));
|
||||
}
|
||||
|
||||
GenomeLoc loc = refContext.getLocus();
|
||||
int endLoc = calculateEndPos(alleles, refAllele, loc);
|
||||
|
||||
return new VariantContext("UG_call",
|
||||
loc.getContig(),
|
||||
loc.getStart(),
|
||||
endLoc,
|
||||
alleles,
|
||||
genotypes,
|
||||
VariantContext.NO_NEG_LOG_10PERROR,
|
||||
null,
|
||||
null);
|
||||
}
|
||||
|
||||
private static VariantContext GLsToPLs(VariantContext vc) {
|
||||
if ( vc == null )
|
||||
return null;
|
||||
|
||||
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
|
||||
for ( Map.Entry<String, Genotype> genotype : vc.getGenotypes().entrySet() ) {
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>(genotype.getValue().getAttributes());
|
||||
attributes.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
|
||||
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.GLsToPLs(genotype.getValue().getLikelihoods().getAsVector()));
|
||||
calls.put(genotype.getKey(), Genotype.modifyAttributes(genotype.getValue(), attributes));
|
||||
}
|
||||
return VariantContext.modifyGenotypes(vc, calls);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute genotypes at a given locus.
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @param vc the GL-annotated variant context
|
||||
* @return the VariantCallContext object
|
||||
*/
|
||||
public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) {
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
|
||||
return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc);
|
||||
}
|
||||
|
||||
private VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
||||
// initialize the data for this thread if that hasn't been done yet
|
||||
if ( afcm.get() == null ) {
|
||||
log10AlleleFrequencyPosteriors.set(new double[N+1]);
|
||||
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
|
||||
}
|
||||
|
||||
// estimate our confidence in a reference call and return
|
||||
if ( vc.getNSamples() == 0 )
|
||||
return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), false, 1.0);
|
||||
|
||||
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
|
||||
clearAFarray(log10AlleleFrequencyPosteriors.get());
|
||||
afcm.get().getLog10PNonRef(tracker, refContext, vc.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
|
||||
|
||||
// find the most likely frequency
|
||||
int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get());
|
||||
|
||||
// calculate p(f>0)
|
||||
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get());
|
||||
double sum = 0.0;
|
||||
for (int i = 1; i <= N; i++)
|
||||
sum += normalizedPosteriors[i];
|
||||
double PofF = Math.min(sum, 1.0); // deal with precision errors
|
||||
|
||||
double phredScaledConfidence;
|
||||
if ( bestAFguess != 0 ) {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
|
||||
if ( Double.isInfinite(phredScaledConfidence) )
|
||||
phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0];
|
||||
} else {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
|
||||
if ( Double.isInfinite(phredScaledConfidence) ) {
|
||||
sum = 0.0;
|
||||
for (int i = 1; i <= N; i++) {
|
||||
if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
|
||||
break;
|
||||
sum += log10AlleleFrequencyPosteriors.get()[i];
|
||||
}
|
||||
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
|
||||
}
|
||||
}
|
||||
|
||||
// did we trigger on the provided track?
|
||||
boolean atTriggerTrack = tracker.getReferenceMetaData(TRIGGER_TRACK_NAME, false).size() > 0;
|
||||
|
||||
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
|
||||
if ( !UAC.ALL_BASES_MODE && !passesEmitThreshold(phredScaledConfidence, bestAFguess, atTriggerTrack) ) {
|
||||
// technically, at this point our confidence in a reference call isn't accurately estimated
|
||||
// because it didn't take into account samples with no data, so let's get a better estimate
|
||||
return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), true, 1.0 - PofF);
|
||||
}
|
||||
|
||||
// create the genotypes
|
||||
Map<String, Genotype> genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
|
||||
|
||||
// print out stats if we have a writer
|
||||
if ( verboseWriter != null )
|
||||
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors);
|
||||
|
||||
// *** note that calculating strand bias involves overwriting data structures, so we do that last
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||
|
||||
DbSNPFeature dbsnp = getDbSNP(tracker);
|
||||
if ( dbsnp != null )
|
||||
attributes.put(VariantContext.ID_KEY, dbsnp.getRsID());
|
||||
|
||||
// if the site was downsampled, record that fact
|
||||
if ( rawContext.hasPileupBeenDownsampled() )
|
||||
attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);
|
||||
|
||||
|
||||
if ( !UAC.NO_SLOD && bestAFguess != 0 ) {
|
||||
final boolean DEBUG_SLOD = false;
|
||||
|
||||
// the overall lod
|
||||
//double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
|
||||
double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
|
||||
if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
|
||||
|
||||
// the forward lod
|
||||
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD);
|
||||
clearAFarray(log10AlleleFrequencyPosteriors.get());
|
||||
afcm.get().getLog10PNonRef(tracker, refContext, vcForward.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
|
||||
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
|
||||
double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
|
||||
double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
|
||||
if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
|
||||
|
||||
// the reverse lod
|
||||
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE);
|
||||
clearAFarray(log10AlleleFrequencyPosteriors.get());
|
||||
afcm.get().getLog10PNonRef(tracker, refContext, vcReverse.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
|
||||
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
|
||||
double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
|
||||
double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
|
||||
if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
|
||||
|
||||
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
|
||||
double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF;
|
||||
if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
|
||||
|
||||
// strand score is max bias between forward and reverse strands
|
||||
double strandScore = Math.max(forwardLod, reverseLod);
|
||||
// rescale by a factor of 10
|
||||
strandScore *= 10.0;
|
||||
//logger.debug(String.format("SLOD=%f", strandScore));
|
||||
|
||||
attributes.put("SB", Double.valueOf(strandScore));
|
||||
}
|
||||
|
||||
GenomeLoc loc = refContext.getLocus();
|
||||
|
||||
int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc);
|
||||
|
||||
VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc,
|
||||
vc.getAlleles(), genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes);
|
||||
|
||||
if ( annotationEngine != null ) {
|
||||
// first off, we want to use the *unfiltered* context for the annotations
|
||||
ReadBackedPileup pileup = null;
|
||||
if (rawContext.hasExtendedEventPileup())
|
||||
pileup = rawContext.getExtendedEventPileup();
|
||||
else if (rawContext.hasBasePileup())
|
||||
pileup = rawContext.getBasePileup();
|
||||
stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
|
||||
|
||||
Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall);
|
||||
vcCall = variantContexts.iterator().next(); // we know the collection will always have exactly 1 element.
|
||||
}
|
||||
|
||||
VariantCallContext call = new VariantCallContext(vcCall, passesCallThreshold(phredScaledConfidence, atTriggerTrack));
|
||||
call.setRefBase(refContext.getBase());
|
||||
return call;
|
||||
}
|
||||
|
||||
private int calculateEndPos(Set<Allele> alleles, Allele refAllele, GenomeLoc loc) {
|
||||
// TODO - temp fix until we can deal with extended events properly
|
||||
// for indels, stop location is one more than ref allele length
|
||||
boolean isSNP = true;
|
||||
for (Allele a : alleles){
|
||||
if (a.getBaseString().length() != 1) {
|
||||
isSNP = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int endLoc = loc.getStart();
|
||||
if ( !isSNP )
|
||||
endLoc += refAllele.length();
|
||||
|
||||
return endLoc;
|
||||
}
|
||||
|
||||
private static boolean isValidDeletionFraction(double d) {
|
||||
return ( d >= 0.0 && d <= 1.0 );
|
||||
}
|
||||
|
||||
private Map<String, StratifiedAlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
BadBaseFilter badReadPileupFilter = new BadBaseFilter(refContext, UAC);
|
||||
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = null;
|
||||
|
||||
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL && rawContext.hasExtendedEventPileup() ) {
|
||||
|
||||
ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup();
|
||||
|
||||
// filter the context based on min mapping quality
|
||||
ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE);
|
||||
|
||||
// don't call when there is no coverage
|
||||
if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE )
|
||||
return null;
|
||||
|
||||
// stratify the AlignmentContext and cut by sample
|
||||
stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
|
||||
|
||||
} else if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP && !rawContext.hasExtendedEventPileup() ) {
|
||||
|
||||
byte ref = refContext.getBase();
|
||||
if ( !BaseUtils.isRegularBase(ref) )
|
||||
return null;
|
||||
|
||||
// stratify the AlignmentContext and cut by sample
|
||||
stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE);
|
||||
|
||||
// filter the reads (and test for bad pileups)
|
||||
if ( !filterPileup(stratifiedContexts, badReadPileupFilter) )
|
||||
return null;
|
||||
}
|
||||
|
||||
return stratifiedContexts;
|
||||
}
|
||||
|
||||
protected static void clearAFarray(double[] AFs) {
|
||||
for ( int i = 0; i < AFs.length; i++ )
|
||||
AFs[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
|
||||
}
|
||||
|
||||
private VariantCallContext estimateReferenceConfidence(Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) {
|
||||
|
||||
double P_of_ref = initialPofRef;
|
||||
|
||||
// for each sample that we haven't examined yet
|
||||
for ( String sample : samples ) {
|
||||
boolean isCovered = contexts.containsKey(sample);
|
||||
if ( ignoreCoveredSamples && isCovered )
|
||||
continue;
|
||||
|
||||
|
||||
int depth = 0;
|
||||
|
||||
if (isCovered) {
|
||||
AlignmentContext context = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
|
||||
|
||||
if (context.hasBasePileup())
|
||||
depth = context.getBasePileup().size();
|
||||
else if (context.hasExtendedEventPileup())
|
||||
depth = context.getExtendedEventPileup().size();
|
||||
}
|
||||
|
||||
P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
|
||||
}
|
||||
|
||||
return new VariantCallContext(QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING);
|
||||
}
|
||||
|
||||
protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors) {
|
||||
Allele refAllele = null, altAllele = null;
|
||||
for ( Allele allele : vc.getAlleles() ) {
|
||||
if ( allele.isReference() )
|
||||
refAllele = allele;
|
||||
else
|
||||
altAllele = allele;
|
||||
}
|
||||
|
||||
for (int i = 0; i <= N; i++) {
|
||||
StringBuilder AFline = new StringBuilder("AFINFO\t");
|
||||
AFline.append(pos);
|
||||
AFline.append("\t");
|
||||
AFline.append(refAllele);
|
||||
AFline.append("\t");
|
||||
if ( altAllele != null )
|
||||
AFline.append(altAllele);
|
||||
else
|
||||
AFline.append("N/A");
|
||||
AFline.append("\t");
|
||||
AFline.append(i + "/" + N + "\t");
|
||||
AFline.append(String.format("%.2f\t", ((float)i)/N));
|
||||
AFline.append(String.format("%.8f\t", log10AlleleFrequencyPriors[i]));
|
||||
if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED)
|
||||
AFline.append("0.00000000\t");
|
||||
else
|
||||
AFline.append(String.format("%.8f\t", log10AlleleFrequencyPosteriors.get()[i]));
|
||||
AFline.append(String.format("%.8f\t", normalizedPosteriors[i]));
|
||||
verboseWriter.println(AFline.toString());
|
||||
}
|
||||
|
||||
verboseWriter.println("P(f>0) = " + PofF);
|
||||
verboseWriter.println("Qscore = " + phredScaledConfidence);
|
||||
verboseWriter.println();
|
||||
}
|
||||
|
||||
private boolean filterPileup(Map<String, StratifiedAlignmentContext> stratifiedContexts, BadBaseFilter badBaseFilter) {
|
||||
int numDeletions = 0, pileupSize = 0;
|
||||
|
||||
for ( StratifiedAlignmentContext context : stratifiedContexts.values() ) {
|
||||
ReadBackedPileup pileup = context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup();
|
||||
for ( PileupElement p : pileup ) {
|
||||
final SAMRecord read = p.getRead();
|
||||
|
||||
if ( p.isDeletion() ) {
|
||||
// if it's a good read, count it
|
||||
if ( read.getMappingQuality() >= UAC.MIN_MAPPING_QUALTY_SCORE &&
|
||||
(UAC.USE_BADLY_MATED_READS || !BadMateFilter.hasBadMate(read)) )
|
||||
numDeletions++;
|
||||
} else {
|
||||
if ( !(read instanceof GATKSAMRecord) )
|
||||
throw new ReviewedStingException("The UnifiedGenotyper currently expects GATKSAMRecords, but instead saw a " + read.getClass());
|
||||
GATKSAMRecord GATKrecord = (GATKSAMRecord)read;
|
||||
GATKrecord.setGoodBases(badBaseFilter, true);
|
||||
if ( GATKrecord.isGoodBase(p.getOffset()) )
|
||||
pileupSize++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// now, test for bad pileups
|
||||
|
||||
// in all_bases mode, it doesn't matter
|
||||
if ( UAC.ALL_BASES_MODE )
|
||||
return true;
|
||||
|
||||
// is there no coverage?
|
||||
if ( pileupSize == 0 )
|
||||
return false;
|
||||
|
||||
// are there too many deletions in the pileup?
|
||||
if ( isValidDeletionFraction(UAC.MAX_DELETION_FRACTION) &&
|
||||
(double)numDeletions / (double)(pileupSize + numDeletions) > UAC.MAX_DELETION_FRACTION )
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Filters low quality bases out of the SAMRecords.
|
||||
*/
|
||||
private class BadBaseFilter implements GATKSAMRecordFilter {
|
||||
private ReferenceContext refContext;
|
||||
private final UnifiedArgumentCollection UAC;
|
||||
|
||||
public BadBaseFilter(ReferenceContext refContext, UnifiedArgumentCollection UAC) {
|
||||
this.refContext = refContext;
|
||||
this.UAC = UAC;
|
||||
}
|
||||
|
||||
public BitSet getGoodBases(final GATKSAMRecord record) {
|
||||
// all bits are set to false by default
|
||||
BitSet bitset = new BitSet(record.getReadLength());
|
||||
|
||||
// if the mapping quality is too low or the mate is bad, we can just zero out the whole read and continue
|
||||
if ( record.getMappingQuality() < UAC.MIN_MAPPING_QUALTY_SCORE ||
|
||||
(!UAC.USE_BADLY_MATED_READS && BadMateFilter.hasBadMate(record)) ) {
|
||||
return bitset;
|
||||
}
|
||||
|
||||
byte[] quals = record.getBaseQualities();
|
||||
for (int i = 0; i < quals.length; i++) {
|
||||
if ( quals[i] >= UAC.MIN_BASE_QUALTY_SCORE )
|
||||
bitset.set(i);
|
||||
}
|
||||
|
||||
// if a read is too long for the reference context, extend the context (being sure not to extend past the end of the chromosome)
|
||||
if ( record.getAlignmentEnd() > refContext.getWindow().getStop() ) {
|
||||
GenomeLoc window = refContext.getGenomeLocParser().createGenomeLoc(refContext.getLocus().getContig(), refContext.getWindow().getStart(), Math.min(record.getAlignmentEnd(), referenceReader.getSequenceDictionary().getSequence(refContext.getLocus().getContig()).getSequenceLength()));
|
||||
byte[] bases = referenceReader.getSubsequenceAt(window.getContig(), window.getStart(), window.getStop()).getBases();
|
||||
StringUtil.toUpperCase(bases);
|
||||
refContext = new ReferenceContext(refContext.getGenomeLocParser(),refContext.getLocus(), window, bases);
|
||||
}
|
||||
|
||||
BitSet mismatches = AlignmentUtils.mismatchesInRefWindow(record, refContext, UAC.MAX_MISMATCHES, MISMATCH_WINDOW_SIZE);
|
||||
if ( mismatches != null )
|
||||
bitset.and(mismatches);
|
||||
|
||||
return bitset;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param tracker rod data
|
||||
*
|
||||
* @return the dbsnp rod if there is one at this position
|
||||
*/
|
||||
protected static DbSNPFeature getDbSNP(RefMetaDataTracker tracker) {
|
||||
return DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
|
||||
}
|
||||
|
||||
protected boolean passesEmitThreshold(double conf, int bestAFguess, boolean atTriggerTrack) {
|
||||
return (atTriggerTrack ?
|
||||
(conf >= Math.min(UAC.TRIGGER_CONFIDENCE_FOR_CALLING, UAC.TRIGGER_CONFIDENCE_FOR_EMITTING)) :
|
||||
((UAC.GENOTYPE_MODE || bestAFguess != 0) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING)));
|
||||
}
|
||||
|
||||
protected boolean passesCallThreshold(double conf, boolean atTriggerTrack) {
|
||||
return (atTriggerTrack ?
|
||||
(conf >= UAC.TRIGGER_CONFIDENCE_FOR_CALLING) :
|
||||
(conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING));
|
||||
}
|
||||
|
||||
protected void computeAlleleFrequencyPriors(int N) {
|
||||
// calculate the allele frequency priors for 1-N
|
||||
double sum = 0.0;
|
||||
for (int i = 1; i <= N; i++) {
|
||||
double value = UAC.heterozygosity / (double)i;
|
||||
log10AlleleFrequencyPriors[i] = Math.log10(value);
|
||||
sum += value;
|
||||
}
|
||||
|
||||
// null frequency for AF=0 is (1 - sum(all other frequencies))
|
||||
log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum);
|
||||
}
|
||||
|
||||
private static GenotypePriors createGenotypePriors(UnifiedArgumentCollection UAC) {
|
||||
GenotypePriors priors;
|
||||
switch ( UAC.GLmodel ) {
|
||||
case SNP:
|
||||
// use flat priors for GLs
|
||||
priors = new DiploidSNPGenotypePriors();
|
||||
break;
|
||||
case DINDEL:
|
||||
// create flat priors for Indels, actual priors will depend on event length to be genotyped
|
||||
priors = new DiploidIndelGenotypePriors();
|
||||
break;
|
||||
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
|
||||
}
|
||||
|
||||
return priors;
|
||||
}
|
||||
|
||||
private static GenotypeLikelihoodsCalculationModel getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) {
|
||||
GenotypeLikelihoodsCalculationModel glcm;
|
||||
switch ( UAC.GLmodel ) {
|
||||
case SNP:
|
||||
glcm = new SNPGenotypeLikelihoodsCalculationModel(UAC, logger);
|
||||
break;
|
||||
case DINDEL:
|
||||
glcm = new DindelGenotypeLikelihoodsCalculationModel(UAC, logger);
|
||||
break;
|
||||
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
|
||||
}
|
||||
|
||||
return glcm;
|
||||
}
|
||||
|
||||
private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
|
||||
AlleleFrequencyCalculationModel afcm;
|
||||
switch ( UAC.AFmodel ) {
|
||||
case EXACT:
|
||||
afcm = new ExactAFCalculationModel(N, logger, verboseWriter);
|
||||
break;
|
||||
case GRID_SEARCH:
|
||||
afcm = new GridSearchAFEstimation(N, logger, verboseWriter);
|
||||
break;
|
||||
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
|
||||
}
|
||||
|
||||
return afcm;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,236 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.PrintStream;
|
||||
|
||||
|
||||
/**
|
||||
* A variant caller which unifies the approaches of several disparate callers. Works for single-sample and
|
||||
* multi-sample data. The user can choose from several different incorporated calculation models.
|
||||
*/
|
||||
@Reference(window=@Window(start=-200,stop=200))
|
||||
@By(DataSource.REFERENCE)
|
||||
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
|
||||
public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedGenotyperV2.UGStatistics> implements TreeReducible<UnifiedGenotyperV2.UGStatistics> {
|
||||
|
||||
|
||||
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||
|
||||
// control the output
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected VCFWriter writer = null;
|
||||
|
||||
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
|
||||
protected PrintStream verboseWriter = null;
|
||||
|
||||
@Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print any relevant callability metrics output", required = false)
|
||||
protected PrintStream metricsWriter = null;
|
||||
|
||||
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
|
||||
protected List<String> annotationsToUse = new ArrayList<String>();
|
||||
|
||||
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
|
||||
protected String[] annotationClassesToUse = { "Standard" };
|
||||
|
||||
// the calculation arguments
|
||||
private UnifiedGenotyperEngine UG_engine = null;
|
||||
|
||||
// the annotation engine
|
||||
private VariantAnnotatorEngine annotationEngine;
|
||||
|
||||
// enable deletions in the pileup
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
// enable extended events for indels
|
||||
public boolean generateExtendedEvents() { return UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL; }
|
||||
|
||||
/**
|
||||
* Inner class for collecting output statistics from the UG
|
||||
*/
|
||||
public static class UGStatistics {
|
||||
/** The total number of passes examined -- i.e., the number of map calls */
|
||||
long nBasesVisited = 0;
|
||||
|
||||
/** The number of bases that were potentially callable -- i.e., those not at excessive coverage or masked with N */
|
||||
long nBasesCallable = 0;
|
||||
|
||||
/** The number of bases called confidently (according to user threshold), either ref or other */
|
||||
long nBasesCalledConfidently = 0;
|
||||
|
||||
/** The number of bases for which calls were emitted */
|
||||
long nCallsMade = 0;
|
||||
|
||||
/** The total number of extended events encountered */
|
||||
long nExtendedEvents = 0;
|
||||
|
||||
double percentCallableOfAll() { return (100.0 * nBasesCallable) / (nBasesVisited-nExtendedEvents); }
|
||||
double percentCalledOfAll() { return (100.0 * nBasesCalledConfidently) / (nBasesVisited-nExtendedEvents); }
|
||||
double percentCalledOfCallable() { return (100.0 * nBasesCalledConfidently) / (nBasesCallable); }
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the samples, output, and genotype calculation model
|
||||
*
|
||||
**/
|
||||
public void initialize() {
|
||||
// get all of the unique sample names
|
||||
// if we're supposed to assume a single sample, do so
|
||||
Set<String> samples = new TreeSet<String>();
|
||||
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
|
||||
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
|
||||
else
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
|
||||
// initialize the verbose writer
|
||||
if ( verboseWriter != null )
|
||||
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior");
|
||||
|
||||
annotationEngine = new VariantAnnotatorEngine(getToolkit(), Arrays.asList(annotationClassesToUse), annotationsToUse);
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples);
|
||||
|
||||
// initialize the header
|
||||
writer.writeHeader(new VCFHeader(getHeaderInfo(), samples)) ;
|
||||
}
|
||||
|
||||
private Set<VCFHeaderLine> getHeaderInfo() {
|
||||
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
|
||||
|
||||
// all annotation fields from VariantAnnotatorEngine
|
||||
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
|
||||
|
||||
// annotation (INFO) fields from UnifiedGenotyper
|
||||
if ( !UAC.NO_SLOD )
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias"));
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
|
||||
|
||||
// also, check to see whether comp rods were included
|
||||
List<ReferenceOrderedDataSource> dataSources = getToolkit().getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
if ( source.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership"));
|
||||
}
|
||||
else if ( source.getName().startsWith(VariantAnnotatorEngine.dbPrefix) ) {
|
||||
String name = source.getName().substring(VariantAnnotatorEngine.dbPrefix.length());
|
||||
headerInfo.add(new VCFInfoHeaderLine(name, 0, VCFHeaderLineType.Flag, name + " Membership"));
|
||||
}
|
||||
}
|
||||
|
||||
// FORMAT and INFO fields
|
||||
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY));
|
||||
|
||||
// FILTER fields
|
||||
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING ||
|
||||
UAC.TRIGGER_CONFIDENCE_FOR_EMITTING < UAC.TRIGGER_CONFIDENCE_FOR_CALLING )
|
||||
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
|
||||
|
||||
return headerInfo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute at a given locus.
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @return the VariantCallContext object
|
||||
*/
|
||||
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext);
|
||||
}
|
||||
|
||||
public UGStatistics reduceInit() { return new UGStatistics(); }
|
||||
|
||||
public UGStatistics treeReduce(UGStatistics lhs, UGStatistics rhs) {
|
||||
lhs.nBasesCallable += rhs.nBasesCallable;
|
||||
lhs.nBasesCalledConfidently += rhs.nBasesCalledConfidently;
|
||||
lhs.nBasesVisited += rhs.nBasesVisited;
|
||||
lhs.nCallsMade += rhs.nCallsMade;
|
||||
return lhs;
|
||||
}
|
||||
|
||||
public UGStatistics reduce(VariantCallContext value, UGStatistics sum) {
|
||||
// we get a point for reaching reduce
|
||||
sum.nBasesVisited++;
|
||||
|
||||
// can't call the locus because of no coverage
|
||||
if ( value == null )
|
||||
return sum;
|
||||
|
||||
// A call was attempted -- the base was potentially callable
|
||||
sum.nBasesCallable++;
|
||||
|
||||
// the base was confidently callable
|
||||
sum.nBasesCalledConfidently += value.confidentlyCalled ? 1 : 0;
|
||||
|
||||
// can't make a confident variant call here
|
||||
if ( value.vc == null )
|
||||
return sum;
|
||||
|
||||
try {
|
||||
// we are actually making a call
|
||||
sum.nCallsMade++;
|
||||
writer.add(value.vc, value.refBase);
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(UGStatistics sum) {
|
||||
logger.info(String.format("Visited bases %d", sum.nBasesVisited));
|
||||
logger.info(String.format("Callable bases %d", sum.nBasesCallable));
|
||||
logger.info(String.format("Confidently called bases %d", sum.nBasesCalledConfidently));
|
||||
logger.info(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll()));
|
||||
logger.info(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll()));
|
||||
logger.info(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable()));
|
||||
logger.info(String.format("Actual calls made %d", sum.nCallsMade));
|
||||
|
||||
if ( metricsWriter != null ) {
|
||||
metricsWriter.println(String.format("Visited bases %d", sum.nBasesVisited));
|
||||
metricsWriter.println(String.format("Callable bases %d", sum.nBasesCallable));
|
||||
metricsWriter.println(String.format("Confidently called bases %d", sum.nBasesCalledConfidently));
|
||||
metricsWriter.println(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll()));
|
||||
metricsWriter.println(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll()));
|
||||
metricsWriter.println(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable()));
|
||||
metricsWriter.println(String.format("Actual calls made %d", sum.nCallsMade));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,64 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo, ebanks
|
||||
* Date: Jan 22, 2010
|
||||
* Time: 2:25:19 PM
|
||||
*
|
||||
* Useful helper class to communicate the results of calculateGenotype to framework
|
||||
*/
|
||||
public class VariantCallContext {
|
||||
public VariantContext vc = null;
|
||||
public byte refBase;
|
||||
|
||||
// Was the site called confidently, either reference or variant?
|
||||
public boolean confidentlyCalled = false;
|
||||
|
||||
VariantCallContext(VariantContext vc, boolean confidentlyCalledP) {
|
||||
this.vc = vc;
|
||||
this.confidentlyCalled = confidentlyCalledP;
|
||||
}
|
||||
|
||||
VariantCallContext(VariantContext vc, byte ref, boolean confidentlyCalledP) {
|
||||
this.vc = vc;
|
||||
this.refBase = ref;
|
||||
this.confidentlyCalled = confidentlyCalledP;
|
||||
}
|
||||
|
||||
// blank variant context => we're a ref site
|
||||
VariantCallContext(boolean confidentlyCalledP) {
|
||||
this.confidentlyCalled = confidentlyCalledP;
|
||||
}
|
||||
|
||||
public void setRefBase(byte ref) {
|
||||
this.refBase = ref;
|
||||
}
|
||||
}
|
||||
|
|
@ -14,7 +14,7 @@ public class GenotypeLikelihoodsUnitTest extends BaseTest {
|
|||
@Test
|
||||
public void testBasic() {
|
||||
logger.warn("Executing testIsBetween");
|
||||
Assert.assertEquals(DELTA, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY,1e-3);
|
||||
Assert.assertEquals(DELTA, DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY,1e-3);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -44,13 +44,13 @@ public class GenotypeLikelihoodsUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
private void testPriorsFromHet(double h, double homRef, double het, double homVar) {
|
||||
double[] vals = DiploidGenotypePriors.heterozygosity2DiploidProbabilities(h);
|
||||
double[] vals = DiploidSNPGenotypePriors.heterozygosity2DiploidProbabilities(h);
|
||||
Assert.assertEquals(vals[0], homRef, DELTA);
|
||||
Assert.assertEquals(vals[1], het, DELTA);
|
||||
Assert.assertEquals(vals[2], homVar, DELTA);
|
||||
Assert.assertEquals(DiploidGenotypePriors.heterozygosity2HomRefProbability(h), homRef, DELTA);
|
||||
Assert.assertEquals(DiploidGenotypePriors.heterozygosity2HetProbability(h), het, DELTA);
|
||||
Assert.assertEquals(DiploidGenotypePriors.heterozygosity2HomVarProbability(h), homVar, DELTA);
|
||||
Assert.assertEquals(DiploidSNPGenotypePriors.heterozygosity2HomRefProbability(h), homRef, DELTA);
|
||||
Assert.assertEquals(DiploidSNPGenotypePriors.heterozygosity2HetProbability(h), het, DELTA);
|
||||
Assert.assertEquals(DiploidSNPGenotypePriors.heterozygosity2HomVarProbability(h), homVar, DELTA);
|
||||
}
|
||||
|
||||
//
|
||||
|
|
@ -71,9 +71,9 @@ public class GenotypeLikelihoodsUnitTest extends BaseTest {
|
|||
private void testGenotypePriors(char ref, double h, double[] array) {
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
double val = 0.0;
|
||||
if ( g.isHomRef((byte)ref) ) val = DiploidGenotypePriors.heterozygosity2HomRefProbability(h);
|
||||
if ( g.isHet() ) val = DiploidGenotypePriors.heterozygosity2HetProbability(h);
|
||||
if ( g.isHomVar((byte)ref) ) val = DiploidGenotypePriors.heterozygosity2HomVarProbability(h);
|
||||
if ( g.isHomRef((byte)ref) ) val = DiploidSNPGenotypePriors.heterozygosity2HomRefProbability(h);
|
||||
if ( g.isHet() ) val = DiploidSNPGenotypePriors.heterozygosity2HetProbability(h);
|
||||
if ( g.isHomVar((byte)ref) ) val = DiploidSNPGenotypePriors.heterozygosity2HomVarProbability(h);
|
||||
|
||||
val = log10(val);
|
||||
double e = array[g.ordinal()];
|
||||
|
|
@ -100,7 +100,7 @@ public class GenotypeLikelihoodsUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
private void testPolarizedGenotypePriors(char ref, double h, double pRefError, double[] array) {
|
||||
DiploidGenotypePriors priors = new DiploidGenotypePriors((byte)ref, h, pRefError);
|
||||
DiploidSNPGenotypePriors priors = new DiploidSNPGenotypePriors((byte)ref, h, pRefError);
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
double val = Math.pow(10, priors.getPrior(g));
|
||||
double e = array[g.ordinal()];
|
||||
|
|
|
|||
|
|
@ -22,27 +22,27 @@ public class
|
|||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
@Test
|
||||
public void testMultiSamplePilot1Joint() {
|
||||
public void testMultiSamplePilot1() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
|
||||
Arrays.asList("1dde074afe990fd38941de18bd35e188"));
|
||||
executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
|
||||
Arrays.asList("aeef287589c010ff33ca5eee116c64f2"));
|
||||
executeTest("testMultiSamplePilot1", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultiSamplePilot2Joint() {
|
||||
public void testMultiSamplePilot2() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,050,000", 1,
|
||||
Arrays.asList("eeed5d52f4bdeed2c73ad5d202cbba26"));
|
||||
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
|
||||
Arrays.asList("59f2869835964eaa03fb9838b7d63bca"));
|
||||
executeTest("testMultiSamplePilot2", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSingleSamplePilot2Joint() {
|
||||
public void testSingleSamplePilot2() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("c34a080e6799ef84f73c6e8d16f7d6ea"));
|
||||
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
|
||||
Arrays.asList("40b360318a7d0be8ae8f482816cb24c8"));
|
||||
executeTest("testSingleSamplePilot2", spec);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -51,7 +51,7 @@ public class
|
|||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
private final static String COMPRESSED_OUTPUT_MD5 = "dfbdccf53668b95e3490c198cbad77e8";
|
||||
private final static String COMPRESSED_OUTPUT_MD5 = "c97fde4b21088eb0d3a5fa2136451ff7";
|
||||
|
||||
@Test
|
||||
public void testCompressedOutput() {
|
||||
|
|
@ -78,7 +78,7 @@ public class
|
|||
|
||||
@Test
|
||||
public void testParallelization() {
|
||||
String md5 = "ce616a641a2e00d29510e99c6606d151";
|
||||
String md5 = "376069f464f0e385c409bf6b5e67cc03";
|
||||
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
|
||||
|
|
@ -105,11 +105,12 @@ public class
|
|||
@Test
|
||||
public void testParameter() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "-genotype", "243630a53c3881e23d89ce6bec448109" );
|
||||
e.put( "-all_bases", "feae0bc485e01bf102d190c7234f3c59" );
|
||||
e.put( "--min_base_quality_score 26", "48837ae22a75002d448806f6b46dd3b3" );
|
||||
e.put( "--min_mapping_quality_score 26", "d388e847b655abf56a7fd945bdff5dee" );
|
||||
e.put( "--max_mismatches_in_40bp_window 5", "416ba33cf1cfc0b4b4a2ad3256c8b56b" );
|
||||
e.put( "-genotype", "707821d365ea146c74e512adfa315610" );
|
||||
e.put( "-all_bases", "bf394fc7ee5d87bb7096a301a0a5ab0a" );
|
||||
e.put( "--min_base_quality_score 26", "f0d3b8e337c9e7a00b457211f3739bda" );
|
||||
e.put( "--min_mapping_quality_score 26", "f53db471200a23713e08ac4b357e090b" );
|
||||
e.put( "--max_mismatches_in_40bp_window 5", "21b21d1c15d75db671821c4f119d6318" );
|
||||
e.put( "--p_nonref_model GRID_SEARCH", "69879139904c2f51a20291bb73763def" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
@ -123,12 +124,12 @@ public class
|
|||
public void testConfidence() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
|
||||
Arrays.asList("aedfce6b54d3138e0459f62421185760"));
|
||||
Arrays.asList("69879139904c2f51a20291bb73763def"));
|
||||
executeTest("testConfidence1", spec1);
|
||||
|
||||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1,
|
||||
Arrays.asList("b442c0fd6e0d3365e8bc2d9149eeb6f5"));
|
||||
Arrays.asList("b928c7c961d0fd7c8a2b6bc9c53be25e"));
|
||||
executeTest("testConfidence2", spec2);
|
||||
}
|
||||
|
||||
|
|
@ -140,8 +141,8 @@ public class
|
|||
@Test
|
||||
public void testHeterozyosity() {
|
||||
HashMap<Double, String> e = new HashMap<Double, String>();
|
||||
e.put( 0.01, "2588b41cd3debfc2d58fdf6b463bcc07" );
|
||||
e.put( 1.0 / 1850, "18fa98796906658402cedd6cc3f28be6" );
|
||||
e.put( 0.01, "73e350d0626a94d224de2f76e621d034" );
|
||||
e.put( 1.0 / 1850, "81c1e24b956407a4e8d6385da8aadd82" );
|
||||
|
||||
for ( Map.Entry<Double, String> entry : e.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
@ -160,8 +161,7 @@ public class
|
|||
@Test
|
||||
public void testOtherBaseCallModel() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "one_state", "62ee552b32d9193b700c1dd8e3eeb4b9" );
|
||||
e.put( "three_state", "f2971d0964780b9abfd38e860080c342" );
|
||||
e.put( "three_state", "57906723c8d483d31504437269c9f59a" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
@ -184,7 +184,7 @@ public class
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,100,000",
|
||||
1,
|
||||
Arrays.asList("4469242939216eac5eb6b6b73ae6a509"));
|
||||
Arrays.asList("68bb882fba0d331c8f5bdf774e4e09a4"));
|
||||
|
||||
executeTest(String.format("testMultiTechnologies"), spec);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ import java.io.File
|
|||
import net.sf.picard.reference.FastaSequenceFile
|
||||
import org.broadinstitute.sting.datasources.pipeline.Pipeline
|
||||
import org.broadinstitute.sting.gatk.DownsampleType
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
|
||||
import org.broadinstitute.sting.queue.extensions.samtools._
|
||||
|
|
@ -39,14 +38,7 @@ class VariantCalling(yaml: File,gatkJar: File) {
|
|||
var ug = new UnifiedGenotyper with StandardCommandLineGATK
|
||||
ug.analysisName = "UnifiedGenotyper"
|
||||
ug.input_file = bams
|
||||
ug.group :+= "Standard"
|
||||
ug.out = output
|
||||
ug.min_base_quality_score = Some(10)
|
||||
ug.min_mapping_quality_score = Some(10)
|
||||
ug.cap_base_quality_by_mapping_quality = true
|
||||
ug.standard_min_confidence_threshold_for_emitting = Some(10)
|
||||
ug.standard_min_confidence_threshold_for_calling = Some(30)
|
||||
ug.trigger_min_confidence_threshold_for_calling = Some(0)
|
||||
ug.downsample_to_coverage = Some(300)
|
||||
ug.dt = Some(DownsampleType.BY_SAMPLE)
|
||||
ug.scatterCount = 50
|
||||
|
|
|
|||
Loading…
Reference in New Issue