Fare thee well, UGv1. Here come the days UGv2.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4747 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-11-29 21:51:19 +00:00
parent 727dac7b7a
commit d89e17ec8c
51 changed files with 789 additions and 4020 deletions

View File

@ -30,7 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidGenotypePriors;
import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidSNPGenotypePriors;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
@ -68,9 +68,9 @@ public class GATKPaperGenotyper extends LocusWalker<Integer,Long> implements Tre
if (ref.getBase() == 'N' || ref.getBase() == 'n') return null; // we don't deal with the N ref base case
ReadBackedPileup pileup = context.getBasePileup().getPileupWithoutMappingQualityZeroReads();
double likelihoods[] = DiploidGenotypePriors.getReferencePolarizedPriors(ref.getBase(),
DiploidGenotypePriors.HUMAN_HETEROZYGOSITY,
0.01);
double likelihoods[] = DiploidSNPGenotypePriors.getReferencePolarizedPriors(ref.getBase(),
DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY,
0.01);
// get the bases and qualities from the pileup
byte bases[] = pileup.getBases();
byte quals[] = pileup.getQuals();

View File

@ -23,7 +23,7 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.VariantContext;

View File

@ -1,7 +1,31 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
/**
 * Enumerates the available base-mismatch error models used when computing
 * four-base likelihoods (see FourBaseLikelihoodsFactory, which maps each
 * constant to a concrete FourBaseLikelihoods implementation).
 */
public enum BaseMismatchModel {
    ONE_STATE,   // single shared error state for all mismatches (NOTE(review): this commit removes its factory support — confirm it is still constructible)
    THREE_STATE, // per-mismatch error states; maps to ThreeStateErrorProbabilities in the factory
    EMPIRICAL    // empirically measured, platform-specific substitution rates; maps to EmpiricalSubstitutionProbabilities
}

View File

@ -71,7 +71,7 @@ public class BatchedCallsMerger extends LocusWalker<VariantContext, Integer> imp
public boolean includeReadsWithDeletionAtLoci() { return true; }
// enable extended events for indels
public boolean generateExtendedEvents() { return UAC.genotypeModel == GenotypeCalculationModel.Model.DINDEL; }
public boolean generateExtendedEvents() { return UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL; }
public void initialize() {
@ -90,8 +90,7 @@ public class BatchedCallsMerger extends LocusWalker<VariantContext, Integer> imp
}
// update the engine
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, writer, null, null);
UG_engine.samples = samples;
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);
// initialize the header
writer.writeHeader(new VCFHeader(headerLines, samples));
@ -127,7 +126,7 @@ public class BatchedCallsMerger extends LocusWalker<VariantContext, Integer> imp
// we are missing samples, call them
if ( missedSamples.size() > 0 ) {
AlignmentContext prunedContext = filterForSamples(context.getBasePileup(), missedSamples);
VariantCallContext vcc = UG_engine.runGenotyper(tracker, ref, prunedContext);
VariantCallContext vcc = UG_engine.calculateLikelihoodsAndGenotypes(tracker, ref, prunedContext);
if ( vcc != null && vcc.vc != null )
calls.add(vcc.vc);
}

View File

@ -23,7 +23,7 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broad.tribble.util.variantcontext.Allele;

View File

@ -23,7 +23,7 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.VariantContext;

View File

@ -1,292 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.vcf.VCFConstants;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import java.util.*;
/**
 * Multi-sample diploid SNP genotype calculation model.
 *
 * At each site it builds per-sample genotype likelihoods (with flat priors) and one
 * AlleleFrequencyMatrix per candidate alternate base, then scans alternate allele
 * frequencies greedily: each frequency increment promotes the single sample whose
 * likelihood gain (REF->HET or HET->HOM) is largest.
 *
 * NOTE(review): several members used here (UAC, log10PofDgivenAFi, LOG10_OPTIMIZATION_EPSILON,
 * VALUE_NOT_CALCULATED, getMinAlleleFrequencyToTest, ignoreAlleleFrequenciesAboveI) are
 * presumably inherited from JointEstimateGenotypeCalculationModel — confirm against that class.
 */
public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalculationModel {

    protected DiploidGenotypeCalculationModel(int N, double heterozygosity) {
        super(N, heterozygosity);
    }

    // the GenotypeLikelihoods map: per-sample likelihoods keyed by sample name; rebuilt per site
    private HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
    // one allele-frequency matrix per candidate alternate base; rebuilt per site
    private HashMap<Byte, AlleleFrequencyMatrix> AFMatrixMap = new HashMap<Byte, AlleleFrequencyMatrix>();

    // diploid genotype states relative to the ref/alt pair; ordinal() indexes matrix columns
    private enum GenotypeType { REF, HET, HOM }

    /**
     * Per-site setup: clears and repopulates GLs and AFMatrixMap from the given
     * per-sample pileups.
     *
     * @param ref         reference base at this site
     * @param contexts    per-sample stratified alignment contexts
     * @param contextType which stratification (e.g. COMPLETE) to read pileups from
     */
    protected void initialize(byte ref,
                              Map<String, StratifiedAlignmentContext> contexts,
                              StratifiedAlignmentContext.StratifiedContextType contextType) {
        // initialize the GenotypeLikelihoods
        GLs.clear();
        AFMatrixMap.clear();

        // for each alternate allele, create a new matrix
        for ( byte alt : BaseUtils.BASES ) {
            if ( alt != ref )
                AFMatrixMap.put(alt, new AlleleFrequencyMatrix(contexts.size()));
        }

        // use flat priors for GLs
        DiploidGenotypePriors priors = new DiploidGenotypePriors();
        for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
            ReadBackedPileup pileup = sample.getValue().getContext(contextType).getBasePileup();

            // create the GenotypeLikelihoods object
            GenotypeLikelihoods GL = new GenotypeLikelihoods(UAC.baseModel, priors, UAC.defaultPlatform);
            GL.add(pileup, true, UAC.CAP_BASE_QUALITY);
            GLs.put(sample.getKey(), GL);

            double[] posteriors = GL.getPosteriors();

            // for each alternate allele, fill the matrix with this sample's
            // hom-ref / het / hom-var posteriors (flat priors, so these track likelihoods)
            DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref);
            for ( byte alt : BaseUtils.BASES ) {
                if ( alt != ref ) {
                    DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(ref, alt);
                    DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(alt);
                    AFMatrixMap.get(alt).setLikelihoods(posteriors[refGenotype.ordinal()], posteriors[hetGenotype.ordinal()], posteriors[homGenotype.ordinal()], sample.getKey());
                }
            }
        }
    }

    /**
     * Fills log10PofDgivenAFi[altIndex][0..numFrequencies] for alternate base alt.
     * Iterates allele frequencies via the greedy matrix, bailing out early once the
     * likelihood has fallen LOG10_OPTIMIZATION_EPSILON below the best seen (further
     * frequencies cannot affect the confidence score).
     */
    protected void calculatelog10PofDgivenAFforAllF(byte ref, byte alt, int numFrequencies, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {
        AlleleFrequencyMatrix matrix = AFMatrixMap.get(alt);
        int baseIndex = BaseUtils.simpleBaseToBaseIndex(alt);

        // first, calculate for AF=0 (no change to matrix)
        log10PofDgivenAFi[baseIndex][0] = matrix.getLikelihoodsOfFrequency();
        double maxLikelihoodSeen = log10PofDgivenAFi[baseIndex][0];

        int minAlleleFrequencyToTest = getMinAlleleFrequencyToTest();

        // for each minor allele frequency, calculate log10PofDgivenAFi
        for (int i = 1; i <= numFrequencies; i++) {
            // add one more alternate allele
            matrix.incrementFrequency();

            // calculate new likelihoods
            log10PofDgivenAFi[baseIndex][i] = matrix.getLikelihoodsOfFrequency();

            // an optimization to speed up the calculation: if we are beyond the local maximum such
            // that subsequent likelihoods won't factor into the confidence score, just quit
            if ( i >= minAlleleFrequencyToTest && maxLikelihoodSeen - log10PofDgivenAFi[baseIndex][i] > LOG10_OPTIMIZATION_EPSILON ) {
                ignoreAlleleFrequenciesAboveI(i, numFrequencies, baseIndex);
                return;
            }

            if ( log10PofDgivenAFi[baseIndex][i] > maxLikelihoodSeen )
                maxLikelihoodSeen = log10PofDgivenAFi[baseIndex][i];
        }
    }

    /**
     * Emits one Genotype per sample for the chosen alt allele and allele frequency,
     * reading each sample's call and confidence from the AF matrix recorded during
     * the frequency scan. DP and GL attributes are attached per sample.
     *
     * @return map from sample name to its Genotype call
     */
    protected Map<String, Genotype> makeGenotypeCalls(byte ref, byte alt, int frequency, Map<String, StratifiedAlignmentContext> contexts, GenomeLoc loc) {
        HashMap<String, Genotype> calls = new HashMap<String, Genotype>();

        // set up some variables we'll need in the loop
        AlleleFrequencyMatrix matrix = AFMatrixMap.get(alt);
        Allele refAllele = Allele.create(ref, true);
        Allele altAllele = Allele.create(alt, false);
        DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref);
        DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(ref, alt);
        DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(alt);

        for ( String sample : GLs.keySet() ) {
            // set the genotype and confidence
            Pair<Integer, Double> AFbasedGenotype = matrix.getGenotype(frequency, sample);
            ArrayList<Allele> myAlleles = new ArrayList<Allele>();
            if ( AFbasedGenotype.first == GenotypeType.REF.ordinal() ) {
                myAlleles.add(refAllele);
                myAlleles.add(refAllele);
            } else if ( AFbasedGenotype.first == GenotypeType.HET.ordinal() ) {
                myAlleles.add(refAllele);
                myAlleles.add(altAllele);
            } else { // ( AFbasedGenotype.first == GenotypeType.HOM.ordinal() )
                myAlleles.add(altAllele);
                myAlleles.add(altAllele);
            }

            HashMap<String, Object> attributes = new HashMap<String, Object>();
            // DP: pileup depth from the COMPLETE (unfiltered) context
            attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());

            // todo -- replace with GenotypeLikelihoods object in Tribble library
            double[] likelihoods = GLs.get(sample).getLikelihoods();
            String GL = String.format("%.2f,%.2f,%.2f",
                    likelihoods[refGenotype.ordinal()],
                    likelihoods[hetGenotype.ordinal()],
                    likelihoods[homGenotype.ordinal()]);
            attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, GL);

            calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, attributes, false));
        }
        return calls;
    }

    /**
     * Greedy per-sample genotype assignment across alternate allele frequencies.
     * Row i holds sample i's (REF, HET, HOM) likelihoods; indexes[i] tracks the
     * genotype currently assigned to sample i at the current frequency. Each
     * frequency increment promotes the single sample with the largest likelihood
     * gain, and the resulting per-sample calls are snapshotted per frequency.
     */
    protected static class AlleleFrequencyMatrix {

        private double[][] matrix;   // allele frequency matrix: [sample][REF|HET|HOM] likelihoods
        private int[] indexes;       // matrix to maintain which genotype is active per sample
        private int N;               // total frequencies (number of samples; max AF is 2N)
        private int frequency;       // current frequency

        // data structures necessary to maintain a list of the best genotypes and their scores
        private ArrayList<String> samples = new ArrayList<String>();
        private HashMap<Integer, HashMap<String, Pair<Integer, Double>>> samplesToGenotypesPerAF = new HashMap<Integer, HashMap<String, Pair<Integer, Double>>>();

        public AlleleFrequencyMatrix(int N) {
            this.N = N;
            frequency = 0;
            matrix = new double[N][3];
            indexes = new int[N];
            for (int i = 0; i < N; i++)
                indexes[i] = 0; // every sample starts as hom-ref (REF.ordinal() == 0)
        }

        // Registers the next sample's (hom-ref, het, hom-var) likelihoods.
        // Row order matches insertion order; `samples` records the mapping.
        public void setLikelihoods(double AA, double AB, double BB, String sample) {
            int index = samples.size();
            samples.add(sample);
            matrix[index][GenotypeType.REF.ordinal()] = AA;
            matrix[index][GenotypeType.HET.ordinal()] = AB;
            matrix[index][GenotypeType.HOM.ordinal()] = BB;
        }

        /**
         * Adds one alternate allele: picks the single sample for which promoting
         * REF->HET or HET->HOM yields the largest likelihood gain, and flips it.
         */
        public void incrementFrequency() {
            if ( frequency == 2 * N )
                throw new ReviewedStingException("Frequency was incremented past N; how is this possible?");
            frequency++;

            double greedy = VALUE_NOT_CALCULATED;
            int greedyIndex = -1;
            for (int i = 0; i < N; i++) {
                if ( indexes[i] == GenotypeType.HET.ordinal() ) {
                    if ( matrix[i][GenotypeType.HOM.ordinal()] - matrix[i][GenotypeType.HET.ordinal()] > greedy ) {
                        greedy = matrix[i][GenotypeType.HOM.ordinal()] - matrix[i][GenotypeType.HET.ordinal()];
                        greedyIndex = i;
                    }
                }
                else if ( indexes[i] == GenotypeType.REF.ordinal() ) {
                    if ( matrix[i][GenotypeType.HET.ordinal()] - matrix[i][GenotypeType.REF.ordinal()] > greedy ) {
                        greedy = matrix[i][GenotypeType.HET.ordinal()] - matrix[i][GenotypeType.REF.ordinal()];
                        greedyIndex = i;
                    }
                    // note that we currently don't bother with breaking ties between samples
                    // (which would be done by looking at the HOM_VAR value) because it's highly
                    // unlikely that a collision will both occur and that the difference will
                    // be significant at HOM_VAR...
                }
                // if this person is already hom var, he can't add another alternate allele
                // so we can ignore that case
            }
            if ( greedyIndex == -1 )
                throw new ReviewedStingException("There is no best choice for a new alternate allele; how is this possible?");

            if ( indexes[greedyIndex] == GenotypeType.HET.ordinal() )
                indexes[greedyIndex] = GenotypeType.HOM.ordinal();
            else
                indexes[greedyIndex] = GenotypeType.HET.ordinal();
        }

        /**
         * Sums the active-genotype likelihoods over all samples for the current
         * frequency, and snapshots the per-sample calls (side effect via recordGenotypes).
         */
        public double getLikelihoodsOfFrequency() {
            double likelihoods = 0.0;
            for (int i = 0; i < N; i++)
                likelihoods += matrix[i][indexes[i]];

            /*
            System.out.println(frequency);
            for (int i = 0; i < N; i++) {
                for (int j=0; j < 3; j++) {
                    System.out.print(String.valueOf(matrix[i][j]));
                    System.out.print(indexes[i] == j ? "* " : " ");
                }
                System.out.println();
            }
            System.out.println(likelihoods);
            System.out.println();
            */

            recordGenotypes();

            return likelihoods;
        }

        // Returns the (genotype ordinal, confidence) recorded for this sample at
        // the given frequency; requires getLikelihoodsOfFrequency() to have run there.
        public Pair<Integer, Double> getGenotype(int frequency, String sample) {
            return samplesToGenotypesPerAF.get(frequency).get(sample);
        }

        /**
         * Snapshots each sample's active genotype and a confidence score at the
         * current frequency. If the assigned genotype is also the max-likelihood
         * one, the score is best-vs-next-best; otherwise it is derived from the
         * normalized probability of the assigned genotype.
         */
        private void recordGenotypes() {
            HashMap<String, Pair<Integer, Double>> samplesToGenotypes = new HashMap<String, Pair<Integer, Double>>();

            int index = 0;
            for ( String sample : samples ) {
                int genotype = indexes[index];

                double score;

                int maxEntry = MathUtils.maxElementIndex(matrix[index]);
                // if the max value is for the most likely genotype, we can compute next vs. next best
                if ( genotype == maxEntry ) {
                    if ( genotype == GenotypeType.REF.ordinal() )
                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.HET.ordinal()], matrix[index][GenotypeType.HOM.ordinal()]);
                    else if ( genotype == GenotypeType.HET.ordinal() )
                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.REF.ordinal()], matrix[index][GenotypeType.HOM.ordinal()]);
                    else // ( genotype == GenotypeType.HOM.ordinal() )
                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.REF.ordinal()], matrix[index][GenotypeType.HET.ordinal()]);
                }
                // otherwise, we need to calculate the probability of the genotype
                else {
                    double[] normalized = MathUtils.normalizeFromLog10(matrix[index]);
                    double chosenGenotype = normalized[genotype];
                    score = -1.0 * Math.log10(1.0 - chosenGenotype);
                }

                samplesToGenotypes.put(sample, new Pair<Integer, Double>(genotype, Math.abs(score)));

                index++;
            }

            samplesToGenotypesPerAF.put(frequency, samplesToGenotypes);
        }
    }
}

View File

@ -1,315 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import java.util.Arrays;
/**
 * Log10-space prior probabilities over the 10 diploid genotypes, indexed by
 * DiploidGenotype.ordinal(). Priors may be flat, reference-polarized (hom-ref /
 * het / hom-var derived from a heterozygosity value, with a small tri-state
 * correction for off-reference hets), or supplied directly.
 */
public class DiploidGenotypePriors {
    // --------------------------------------------------------------------------------------------------------------
    //
    // Constants and static information
    //
    // --------------------------------------------------------------------------------------------------------------
    public static final double HUMAN_HETEROZYGOSITY = 1e-3;
    public static final double CEU_HETEROZYGOSITY = 1e-3;
    public static final double YRI_HETEROZYGOSITY = 1.0 / 850;

    /**
     * Default value of the prob of seeing a reference error. Used to calculate the
     * chance of seeing a true B/C het when the reference is A, which we assume is the product
     * of the ref error rate and the het. Default value is Q60
     */
    public static final double PROB_OF_REFERENCE_ERROR = 1e-6; // i.e. the Q60 reference error rate

    // flat (uniform) log10 priors, filled in by the static initializer below
    private final static double[] flatPriors = new double[DiploidGenotype.values().length];

    // --------------------------------------------------------------------------------------------------------------
    //
    // Diploid priors
    //
    // --------------------------------------------------------------------------------------------------------------
    // log10 prior per DiploidGenotype, indexed by ordinal()
    private double[] priors = null;

    // todo -- fix me when this issue is resolved
    public static final boolean requirePriorSumToOne = false;

    /**
     * Create a new DiploidGenotypePriors object with flat priors for each diploid genotype
     */
    public DiploidGenotypePriors() {
        priors = flatPriors.clone();
    }

    /**
     * Create a new GenotypeLikelihoods object with priors for a diploid with heterozygosity and reference
     * base ref
     *
     * @param ref            the reference base
     * @param heterozygosity probability of a site being heterozygous
     * @param dontPolarize   if true, use reference-independent priors instead of polarized ones
     */
    public DiploidGenotypePriors(byte ref, double heterozygosity, boolean dontPolarize) {
        if ( dontPolarize ) {
            double[] vals = heterozygosity2DiploidProbabilities(heterozygosity);
            priors = getReferenceIndependentPriors(ref, vals[0], vals[1], vals[2]);
        } else {
            priors = getReferencePolarizedPriors(ref, heterozygosity, PROB_OF_REFERENCE_ERROR);
        }
    }

    /**
     * Create a new GenotypeLikelihoods object with priors for a diploid with heterozygosity and reference
     * base ref
     *
     * @param ref                    the reference base
     * @param heterozygosity         probability of a site being heterozygous
     * @param probOfTriStateGenotype The prob of seeing a true B/C het when the reference is A
     */
    public DiploidGenotypePriors(byte ref, double heterozygosity, double probOfTriStateGenotype) {
        priors = getReferencePolarizedPriors(ref, heterozygosity, probOfTriStateGenotype);
    }

    /**
     * Create a new DiploidGenotypePriors object with priors (in log10 space) for each of the DiploidGenotypes
     *
     * @param log10Priors log10 prior per genotype, indexed by DiploidGenotype.ordinal(); copied defensively
     */
    public DiploidGenotypePriors(double[] log10Priors) {
        priors = log10Priors.clone();
    }

    /**
     * Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values().
     *
     * @return log10 prior as a double array
     */
    public double[] getPriors() {
        return priors;
    }

    /**
     * Returns the prior associated with DiploidGenotype g
     * @param g the genotype
     * @return log10 prior as a double
     */
    public double getPrior(DiploidGenotype g) {
        return getPriors()[g.ordinal()];
    }

    /**
     * Sanity-checks the priors: each must be a well-formed, non-positive log10 value
     * (and, if requirePriorSumToOne, they must sum to 1 in linear space).
     *
     * @param throwException if true, rethrow validation failures; otherwise return false
     * @return true if the priors are valid
     */
    public boolean validate(boolean throwException) {
        try {
            if ( requirePriorSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(priors), 1.0) != 0 ) {
                throw new IllegalStateException(String.format("Priors don't sum to 1: sum=%f %s", MathUtils.sumLog10(priors), Arrays.toString(priors)));
            }

            for ( DiploidGenotype g : DiploidGenotype.values() ) {
                int i = g.ordinal();
                if ( ! MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) {
                    String bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i]));
                    throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
                }
            }
        } catch ( IllegalStateException e ) {
            if ( throwException )
                throw new RuntimeException(e);
            else
                return false;
        }

        return true;
    }

    // --------------------------------------------------------------------------------------------------------------
    //
    // Static functionality
    //
    // --------------------------------------------------------------------------------------------------------------

    /**
     * Returns homozygous-reference, heterozygous, and homozygous-non-ref probabilities given a heterozygosity
     * value, as elements 0, 1, and 2 of a double[], respectively
     *
     * @param h the heterozygosity [probability of a base being heterozygous]
     */
    @Deprecated
    public static double[] heterozygosity2DiploidProbabilities(double h) {
        double[] pdbls = new double[3];

        pdbls[0] = heterozygosity2HomRefProbability(h);
        pdbls[1] = heterozygosity2HetProbability(h);
        pdbls[2] = heterozygosity2HomVarProbability(h);
        return pdbls;
    }

    /**
     * @param h the heterozygosity; must be non-negative and <= 2/3
     * @return the hom-ref probability, 1 - 3h/2
     */
    public static double heterozygosity2HomRefProbability(double h) {
        if (MathUtils.isNegative(h)) {
            throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
        }

        double v = 1.0 - (3.0 * h / 2.0);
        if (MathUtils.isNegative(v)) {
            throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
        }

        return v;
    }

    /** @return the het probability, h itself (h must be non-negative) */
    public static double heterozygosity2HetProbability(double h) {
        if (MathUtils.isNegative(h)) {
            throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
        }

        return h;
    }

    /** @return the hom-var probability, h/2 (h must be non-negative) */
    public static double heterozygosity2HomVarProbability(double h) {
        if (MathUtils.isNegative(h)) {
            throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
        }

        return h / 2.0;
    }

    /**
     * Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector
     * appropriately.
     *
     * Suppose A is the reference base, and we are given the probability of being hom-ref, het, and hom-var,
     * and that pTriSateGenotype is the true probability of observing reference A and a true genotype of B/C
     * then this sets the priors to:
     *
     * AA = hom-ref
     * AC = AG = AT = (het - pTriStateGenotype) / 3
     * CC = GG = TT = hom-var / 3
     * CG = CT = GT = pTriStateGenotype / 3
     *
     * So that we get:
     *
     * hom-ref + 3 * (het - pTriStateGenotype) / 3 + 3 * hom-var / 3 + 3 * pTriStateGenotype
     * hom-ref + het - pTriStateGenotype + hom-var + pTriStateGenotype
     * hom-ref + het + hom-var
     * = 1
     *
     * @param ref the reference base
     * @param heterozyosity probability of a site being heterozygous
     * @param pRefError probability of a reference error; must lie in [0.0, 0.01]
     */
    public static double[] getReferencePolarizedPriors(byte ref, double heterozyosity, double pRefError ) {
        if ( ! MathUtils.isBounded(pRefError, 0.0, 0.01) ) {
            throw new RuntimeException(String.format("BUG: p Reference error is out of bounds (0.0 - 0.01) is allow range %f", pRefError));
        }

        double pTriStateGenotype = heterozyosity * pRefError;
        //        if ( pTriStateGenotype >= heterozyosity ) {
        //            throw new RuntimeException(String.format("p Tristate genotype %f is greater than the heterozygosity %f", pTriStateGenotype, heterozyosity));
        //        }

        double pHomRef = heterozygosity2HomRefProbability(heterozyosity);
        double pHet    = heterozygosity2HetProbability(heterozyosity);
        double pHomVar = heterozygosity2HomVarProbability(heterozyosity);

        if (MathUtils.compareDoubles(pHomRef + pHet + pHomVar, 1.0) != 0) {
            throw new RuntimeException(String.format("BUG: Prior probabilities don't sum to one => %f, %f, %f", pHomRef, pHet, pHomVar));
        }

        double[] priors = new double[DiploidGenotype.values().length];

        for ( DiploidGenotype g : DiploidGenotype.values() ) {
            double POfG = 0.0;

            final double nOnRefHets = 3;
            final double nOffRefHets = 3;
            final double nHomVars = 3;

            if ( g.isHomRef(ref) )      { POfG = pHomRef; }
            else if ( g.isHomVar(ref) ) { POfG = pHomVar / nHomVars; }
            else if ( g.isHetRef(ref) ) { POfG = (pHet - pTriStateGenotype ) / nOnRefHets; }
            else                        { POfG = pTriStateGenotype / nOffRefHets; }

            priors[g.ordinal()] = Math.log10(POfG);
        }

        return priors;
    }

    /**
     * Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector
     * appropriately.
     *
     * TODO -- delete me
     *
     * @param ref the reference base
     * @param priorHomRef prior probability of hom-ref
     * @param priorHet prior probability of het
     * @param priorHomVar prior probability of hom-var
     */
    @Deprecated
    public static double[] getReferenceIndependentPriors(byte ref, double priorHomRef, double priorHet, double priorHomVar ) {
        // FIX: was an exact floating-point equality test ((a+b+c) != 1), which can
        // spuriously fail from rounding; use the tolerant comparison, matching
        // getReferencePolarizedPriors above
        if ( MathUtils.compareDoubles(priorHomRef + priorHet + priorHomVar, 1.0) != 0 ) {
            throw new RuntimeException(String.format("Prior probabilities don't sum to one => %f, %f, %f", priorHomRef, priorHet, priorHomVar));
        }

        double[] priors = new double[DiploidGenotype.values().length];

        for ( DiploidGenotype g : DiploidGenotype.values() ) {
            int nRefBases = MathUtils.countOccurrences((char)ref, g.toString()); // todo -- fixme

            double log10POfG = 0.0;
            switch ( nRefBases ) {
                case 2: // hom-ref
                    log10POfG = Math.log10(priorHomRef);
                    break;
                case 0: // hom-nonref
                    //log10POfG = Math.log10(priorHomVar / 3);
                    log10POfG = Math.log10(priorHomVar);
                    break;
                default:
                    //log10POfG = Math.log10(priorHet / 6);
                    log10POfG = Math.log10(priorHet);
                    break;
            }

            priors[g.ordinal()] = log10POfG;
        }

        return priors;
    }

    static {
        // uniform log10 prior over all diploid genotypes
        for ( DiploidGenotype g : DiploidGenotype.values() ) {
            flatPriors[g.ordinal()] = Math.log10(1.0 / DiploidGenotype.values().length);
        }
    }
}

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
import org.broadinstitute.sting.utils.MathUtils;

View File

@ -23,7 +23,7 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.*;

View File

@ -23,7 +23,7 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;

View File

@ -1,3 +1,28 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.BaseUtils;
@ -9,7 +34,7 @@ import java.util.EnumMap;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMReadGroupRecord;
public class EmpiricalSubstitutionProbabilities extends FourBaseProbabilities {
public class EmpiricalSubstitutionProbabilities extends FourBaseLikelihoods {
// --------------------------------------------------------------------------------------------------------------
//
// Static methods to manipulate machine platforms
@ -38,8 +63,8 @@ public class EmpiricalSubstitutionProbabilities extends FourBaseProbabilities {
bind("454", SequencerPlatform.ROCHE454);
bind("ILLUMINA", SequencerPlatform.SOLEXA);
bind("SOLEXA", SequencerPlatform.SOLEXA);
bind("solid", SequencerPlatform.SOLID);
bind("abi_solid", SequencerPlatform.SOLID);
bind("SOLID", SequencerPlatform.SOLID);
bind("ABI_SOLID", SequencerPlatform.SOLID);
bind("CG", SequencerPlatform.CG);
}
@ -253,7 +278,7 @@ public class EmpiricalSubstitutionProbabilities extends FourBaseProbabilities {
if ( pl == SequencerPlatform.UNKNOWN ) {
if ( raiseErrorOnUnknownPlatform )
throw new RuntimeException("Unknown Sequencer platform for read " + read.format() + "; your BAM file is missing the PL tag for some read groups. Please specify a default platform. See your walker's documentation to accomplish this");
throw new RuntimeException("Unknown sequencer platform for read " + read.format() + "; your BAM file is either missing the PL tag for some read groups or an unsupported platform is being used.");
else {
pl = defaultPlatform;
//System.out.printf("Using default platform %s", pl);

View File

@ -23,7 +23,7 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.Allele;

View File

@ -23,7 +23,7 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.*;

View File

@ -23,9 +23,9 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
package org.broadinstitute.sting.gatk.walkers.genotyper;
import static org.broadinstitute.sting.playground.gatk.walkers.genotyper.BaseMismatchModel.*;
import static org.broadinstitute.sting.gatk.walkers.genotyper.BaseMismatchModel.*;
public class FourBaseLikelihoodsFactory {
//private FourBaseProbabilitiesFactory() {} // cannot be instantiated
@ -35,9 +35,7 @@ public class FourBaseLikelihoodsFactory {
}
public static BaseMismatchModel getBaseMismatchModel(final FourBaseLikelihoods m) {
if ( m instanceof OneStateErrorProbabilities)
return ONE_STATE;
else if ( m instanceof ThreeStateErrorProbabilities)
if ( m instanceof ThreeStateErrorProbabilities)
return THREE_STATE;
else if ( m instanceof EmpiricalSubstitutionProbabilities)
return EMPIRICAL;
@ -55,7 +53,6 @@ public class FourBaseLikelihoodsFactory {
public static FourBaseLikelihoods makeFourBaseLikelihoods(BaseMismatchModel m,
EmpiricalSubstitutionProbabilities.SequencerPlatform pl ) {
switch ( m ) {
case ONE_STATE: return new OneStateErrorProbabilities();
case THREE_STATE: return new ThreeStateErrorProbabilities();
case EMPIRICAL: return new EmpiricalSubstitutionProbabilities(pl);
default: throw new RuntimeException("Unexpected BaseMismatchModel " + m);

View File

@ -1,348 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.*;
import static java.lang.Math.log10;
import static java.lang.Math.pow;
/**
 * Stable, error checking version of strict 4 base likelihoods. Useful for calculating the likelihoods, priors,
 * and posteriors given a pile of bases and quality scores (in conjunction with GenotypeLikelihoods)
 *
 * Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN.  This object
 * calculates:
 *
 * P(b | D) = P(b) * P(D | b)
 *
 * where
 *
 * P(D | b) = sum_i log10 P(bi | b)
 *
 * and
 *
 * P(bi | b) = 1 - P(error | q1) if bi = b
 *           = P(error | q1) / 3 if bi != b
 *
 * All per-base values are stored in log10 space, indexed by BaseUtils.simpleBaseToBaseIndex().
 */
public abstract class FourBaseProbabilities implements Cloneable {
    protected boolean enableCacheFlag = true;

    //
    // The fundamental data array associated with 4-base likelihoods: the running
    // sum_i log10 P(bi | b) for each of the four chromosomal bases (log10 space).
    //
    protected double[] log10Likelihoods = null;

    /**
     * If true, lots of output will be generated about the likelihoods at each site
     */
    private boolean verbose = false;

    /**
     * Bases with Q scores below this threshold aren't included in the likelihood calculation
     */
    private int minQScoreToInclude = 0;

    /**
     * Create a new FourBaseProbabilities object; all four log10 likelihoods start at zero.
     */
    public FourBaseProbabilities() {
        log10Likelihoods = zeros.clone();          // likelihoods are all zeros
    }

    /**
     * Cloning of the object; the likelihood array is deep-copied so the clone is independent.
     *
     * @return clone
     * @throws CloneNotSupportedException
     */
    protected Object clone() throws CloneNotSupportedException {
        FourBaseProbabilities c = (FourBaseProbabilities)super.clone();
        c.log10Likelihoods = log10Likelihoods.clone();
        return c;
    }

    public void setVerbose(boolean v) {
        verbose = v;
    }

    public boolean isVerbose() {
        return verbose;
    }

    public int getMinQScoreToInclude() {
        return minQScoreToInclude;
    }

    public void setMinQScoreToInclude(int minQScoreToInclude) {
        this.minQScoreToInclude = minQScoreToInclude;
    }

    /**
     * Returns an array of log10 likelihoods for each base, indexed by BaseUtils.BASES.ordinal values()
     *
     * @return log10 likelihoods
     */
    public double[] getLog10Likelihoods() {
        return log10Likelihoods;
    }

    /**
     * Returns the log10 likelihood associated with a base
     *
     * @param base base
     * @return log10 likelihood as a double; 0.0 for a non-ACGT base
     */
    public double getLog10Likelihood(byte base) {
        return getLog10Likelihood(BaseUtils.simpleBaseToBaseIndex(base));
    }

    public double getLog10Likelihood(int baseIndex) {
        return (baseIndex < 0 ? 0.0 : getLog10Likelihoods()[baseIndex]);
    }

    /**
     * Returns an array of linear-space likelihoods for each base, indexed by BaseUtils.BASES.ordinal values()
     *
     * @return probs
     */
    public double[] getLikelihoods() {
        double[] probs = new double[4];
        for (int i = 0; i < 4; i++)
            probs[i] = Math.pow(10, log10Likelihoods[i]);
        return probs;
    }

    /**
     * Returns the linear-space likelihood associated with a base
     *
     * @param base base
     * @return likelihood as a double; 0.0 for a non-ACGT base
     */
    public double getLikelihood(byte base) {
        return getLikelihood(BaseUtils.simpleBaseToBaseIndex(base));
    }

    public double getLikelihood(int baseIndex) {
        return (baseIndex < 0 ? 0.0 : Math.pow(10, log10Likelihoods[baseIndex]));
    }

    // -----------------------------------------------------------------------------------------------------------------
    //
    //
    // add() -- the heart of the calculation
    //
    //
    // -----------------------------------------------------------------------------------------------------------------

    /**
     * Updates likelihoods and posteriors to reflect an additional observation of observedBase with
     * qualityScore.
     *
     * @param observedBase observed base
     * @param qualityScore base quality
     * @param read         SAM read
     * @param offset       offset on read
     * @return 1 if the base was considered good enough to add to the likelihoods (not Q0 or 'N', for example)
     */
    public int add(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
        FourBaseProbabilities fbl = computeLog10Likelihoods(observedBase, qualityScore, read, offset);
        if ( fbl == null )
            return 0;

        for ( byte base : BaseUtils.BASES ) {
            // BUGFIX: accumulate the log10-space value.  The previous code added the
            // linear-space probability (getLikelihood, i.e. pow(10, log10L)) into the
            // log10Likelihoods array, corrupting the running log-space totals.
            double log10Likelihood = fbl.getLog10Likelihood(base);
            log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)] += log10Likelihood;
        }

        if ( verbose ) {
            for ( byte base : BaseUtils.BASES ) { System.out.printf("%s\t", (char)base); }
            System.out.println();
            for ( byte base : BaseUtils.BASES ) { System.out.printf("%.2f\t", log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)]); }
            System.out.println();
        }

        return 1;
    }

    /**
     * Computes the per-base log10 likelihoods for a single observation of observedBase with
     * qualityScore, without mutating this object.
     *
     * @param observedBase observed base
     * @param qualityScore base quality
     * @param read         SAM read
     * @param offset       offset on read
     * @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example)
     */
    public FourBaseProbabilities computeLog10Likelihoods(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
        if ( badBase(observedBase) ) {
            return null;
        }

        try {
            if ( qualityScore > getMinQScoreToInclude() ) {
                // clone() serves as a prototype: the result carries the concrete subclass's
                // error model but a fresh, zeroed likelihood array
                FourBaseProbabilities fbl = (FourBaseProbabilities)this.clone();
                fbl.log10Likelihoods = zeros.clone();
                for ( byte base : BaseUtils.BASES ) {
                    double likelihood = log10PofObservingBaseGivenChromosome(observedBase, base, qualityScore, read, offset);
                    if ( verbose ) {
                        boolean fwdStrand = ! read.getReadNegativeStrandFlag();
                        System.out.printf("  L(%c | b=%s, Q=%d, S=%s) = %f / %f%n",
                                observedBase, base, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood);
                    }
                    fbl.log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)] = likelihood;
                }
                return fbl;
            }
        } catch ( CloneNotSupportedException e ) {
            throw new RuntimeException(e);
        }

        return null;
    }

    // -----------------------------------------------------------------------------------------------------------------
    //
    //
    // helper routines
    //
    //
    // -----------------------------------------------------------------------------------------------------------------

    /**
     * Returns true when the observedBase is considered bad and shouldn't be processed by this object.  A base
     * is considered bad if:
     *
     * Criterion 1: observed base isn't a A,C,T,G or lower case equivalent
     *
     * @param observedBase observed base
     * @return true if the base is a bad base
     */
    private boolean badBase(byte observedBase) {
        return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1;
    }

    /**
     * Return a string representation of this object in a moderately usable form
     *
     * @return string representation
     */
    public String toString() {
        double sum = 0;
        StringBuilder s = new StringBuilder();
        for ( byte base : BaseUtils.BASES ) {
            int baseIndex = BaseUtils.simpleBaseToBaseIndex(base);
            // cast to char so the base letter is printed rather than its numeric byte value
            s.append(String.format("%s %.10f ", (char)base, log10Likelihoods[baseIndex]));
            sum += Math.pow(10, log10Likelihoods[baseIndex]);
        }
        s.append(String.format(" %f", sum));
        return s.toString();
    }

    // in general, we don't care about the platform index; EmpiricalSubstitutionProbabilities overloads this
    public int getReadSequencerPlatformIndex( SAMRecord read ) {
        return 0;
    }

    // -----------------------------------------------------------------------------------------------------------------
    //
    //
    // Validation routines
    //
    //
    // -----------------------------------------------------------------------------------------------------------------

    public boolean validate() {
        return validate(true);
    }

    /**
     * Verifies that every stored log10 likelihood is a well-formed, non-positive double.
     *
     * @param throwException if true, a malformed value raises a RuntimeException; otherwise false is returned
     * @return true if all values validate
     */
    public boolean validate(boolean throwException) {
        try {
            for ( byte base : BaseUtils.BASES ) {
                int i = BaseUtils.simpleBaseToBaseIndex(base);
                if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) {
                    String bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]);
                    throw new IllegalStateException(String.format("At %s: %s", base, bad));
                }
            }
        } catch ( IllegalStateException e ) {
            if ( throwException )
                throw new RuntimeException(e);
            else
                return false;
        }
        return true;
    }

    // -----------------------------------------------------------------------------------------------------------------
    //
    //
    // Hearty math calculations follow
    //
    //  -- these should not be messed with unless you know what you are doing
    //
    // -----------------------------------------------------------------------------------------------------------------

    /**
     * log10 P(observedBase | chromBase, qual): 1 - e when the observation matches the
     * chromosome, otherwise e scaled by the subclass's miscall substitution model.
     *
     * @param observedBase observed base
     * @param chromBase    target base
     * @param qual         base quality
     * @param read         SAM read
     * @param offset       offset on read
     * @return log10 likelihood
     */
    protected double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual, SAMRecord read, int offset) {
        if (qual == 0) { // zero quals are wrong
            throw new RuntimeException(String.format("Unexpected Q0 base discovered in log10PofObservingBaseGivenChromosome: %c %s %d at %d in %s",
                    observedBase, chromBase, qual, offset, read));
        }

        double logP;
        if ( observedBase == chromBase ) {
            // the base is consistent with the chromosome -- it's 1 - e
            //logP = oneMinusData[qual];
            double e = pow(10, (qual / -10.0));
            logP = log10(1.0 - e);
        } else {
            // the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error)
            logP = qual / -10.0 + log10PofTrueBaseGivenMiscall(observedBase, chromBase, read, offset);
        }
        //System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP);
        return logP;
    }

    /**
     * Must be overridden by concrete subclasses
     *
     * @param observedBase observed base
     * @param chromBase    target base
     * @param read         SAM read
     * @param offset       offset on read
     * @return log10 likelihood
     */
    protected abstract double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset);

    //
    // Constant static data
    //
    private final static double[] zeros = new double[BaseUtils.BASES.length];

    static {
        for ( byte base : BaseUtils.BASES ) {
            zeros[BaseUtils.simpleBaseToBaseIndex(base)] = 0.0;
        }
    }
}

View File

@ -1,39 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import static org.broadinstitute.sting.gatk.walkers.genotyper.BaseMismatchModel.*;
/**
 * Static helpers for translating between BaseMismatchModel enum constants and the
 * concrete FourBaseProbabilities implementations that realize them.
 */
public class FourBaseProbabilitiesFactory {
    //private FourBaseProbabilitiesFactory() {} // cannot be instantiated

    /** Looks up a BaseMismatchModel by the name of its enum constant. */
    public static BaseMismatchModel getBaseMismatchModel(final String name) {
        return valueOf(name);
    }

    /**
     * Maps a concrete probability model instance back to its enum constant.
     *
     * @param m the four-base probability model
     * @return the matching BaseMismatchModel constant
     */
    public static BaseMismatchModel getBaseMismatchModel(final FourBaseProbabilities m) {
        if ( m instanceof OneStateErrorProbabilities )
            return ONE_STATE;
        if ( m instanceof ThreeStateErrorProbabilities )
            return THREE_STATE;
        if ( m instanceof EmpiricalSubstitutionProbabilities )
            return EMPIRICAL;
        throw new RuntimeException("Unexpected BaseMismatchModel " + m.getClass());
    }

    /**
     * General and correct way to create FourBaseLikelihood objects for arbitrary base mismatching models
     *
     * @param m  model
     * @param pl default platform, used only by the empirical model
     * @return new 4-base model
     */
    public static FourBaseProbabilities makeFourBaseLikelihoods(BaseMismatchModel m,
                                                                EmpiricalSubstitutionProbabilities.SequencerPlatform pl ) {
        switch ( m ) {
            case ONE_STATE:   return new OneStateErrorProbabilities();
            case THREE_STATE: return new ThreeStateErrorProbabilities();
            case EMPIRICAL:   return new EmpiricalSubstitutionProbabilities(pl);
            default:          throw new RuntimeException("Unexpected BaseMismatchModel " + m);
        }
    }
}

View File

@ -1,93 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.io.PrintStream;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
/**
 * The model representing how we calculate a genotype given the priors and a pile
 * of bases and quality scores
 */
public abstract class GenotypeCalculationModel implements Cloneable {

    public enum Model {
        JOINT_ESTIMATE,
        DINDEL
    }

    protected UnifiedArgumentCollection UAC;
    protected Set<String> samples;
    protected Logger logger;
    protected PrintStream verboseWriter;

    /** Subclasses are constructed empty; shared state arrives later via initialize(). */
    protected GenotypeCalculationModel() {}

    /**
     * Supplies the shared state every calculation model needs.
     * Assumes that out is not null.
     *
     * @param samples       samples in input bam
     * @param logger        logger
     * @param UAC           unified arg collection (defensively cloned)
     * @param verboseWriter verbose writer
     */
    protected void initialize(Set<String> samples,
                              Logger logger,
                              UnifiedArgumentCollection UAC,
                              PrintStream verboseWriter) {
        this.logger = logger;
        this.verboseWriter = verboseWriter;
        this.samples = new TreeSet<String>(samples);  // sorted copy, insulated from the caller
        this.UAC = UAC.clone();
    }

    /**
     * Must be overridden by concrete subclasses
     *
     * @param tracker            rod data
     * @param ref                reference base
     * @param loc                GenomeLoc
     * @param stratifiedContexts stratified alignment contexts
     * @param priors             priors to use for GL
     *
     * @return call
     */
    public abstract VariantCallContext callLocus(RefMetaDataTracker tracker,
                                                 byte ref,
                                                 GenomeLoc loc,
                                                 Map<String, StratifiedAlignmentContext> stratifiedContexts,
                                                 DiploidGenotypePriors priors);

    /**
     * @param tracker            rod data
     * @param ref                reference bases
     * @param loc                GenomeLoc
     * @param stratifiedContexts stratified alignment contexts
     *
     * @return call
     */
    public abstract VariantCallContext callExtendedLocus(RefMetaDataTracker tracker,
                                                         byte[] ref,
                                                         GenomeLoc loc,
                                                         Map<String, StratifiedAlignmentContext> stratifiedContexts);

    /**
     * @param tracker rod data
     *
     * @return the dbsnp rod if there is one at this position
     */
    public static DbSNPFeature getDbSNP(RefMetaDataTracker tracker) {
        return DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
    }
}

View File

@ -1,70 +0,0 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import static org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model.*;
import org.apache.log4j.Logger;
import java.util.Set;
import java.io.PrintStream;
/**
 * Factory for building GenotypeCalculationModel instances from a model selection.
 */
public class GenotypeCalculationModelFactory {

    /** Resolves a model by the name of its enum constant. */
    public static GenotypeCalculationModel.Model getGenotypeCalculationModel(final String name) {
        return valueOf(name);
    }

    /**
     * General and correct way to create GenotypeCalculationModel objects for arbitrary models
     *
     * @param samples       samples in bam
     * @param logger        logger
     * @param UAC           the unified argument collection
     * @param verboseWriter verbose writer
     *
     * @return model, already initialized with the supplied state
     */
    public static GenotypeCalculationModel makeGenotypeCalculation(Set<String> samples,
                                                                   Logger logger,
                                                                   UnifiedArgumentCollection UAC,
                                                                   PrintStream verboseWriter) {
        final GenotypeCalculationModel model;
        switch ( UAC.genotypeModel ) {
            case JOINT_ESTIMATE:
                model = new DiploidGenotypeCalculationModel(samples.size(), UAC.heterozygosity);
                break;
            case DINDEL:
                // not yet wired up: new SimpleIndelCalculationModel()
                throw new UnsupportedOperationException("The Dindel-based genotype likelihoods model is not currently supported");
            default:
                throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel);
        }
        model.initialize(samples, logger, UAC, verboseWriter);
        return model;
    }
}

View File

@ -1,509 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import static java.lang.Math.log10;
import static java.lang.Math.pow;
/**
 * Stable, error checking version of the Bayesian genotyper.  Useful for calculating the likelihoods, priors,
 * and posteriors given a pile of bases and quality scores
 *
 * Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN.  This object
 * calculates:
 *
 * P(G | D) = P(G) * P(D | G)
 *
 * where
 *
 * P(D | G) = sum_i log10 P(bi | G)
 *
 * and
 *
 * P(bi | G) = 1 - P(error | q1) if bi is in G
 *           = P(error | q1) / 3 if bi is not in G
 *
 * for homozygous genotypes and for heterozygous genotypes:
 *
 * P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G
 *           = P(error | q1) / 3 if bi is not in G
 *
 * for each of the 10 unique diploid genotypes AA, AC, AG, .., TT
 *
 * Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space.
 *
 * The priors contain the relative probabilities of each genotype, and must be provided at object creation.
 * From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above
 * model.
 */
public class GenotypeLikelihoods implements Cloneable {
    protected final static int FIXED_PLOIDY = 2;
    protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1;
    protected boolean enableCacheFlag = true;
    protected boolean VERBOSE = false;
    //
    // The fundamental data arrays associated with a GenotypeLikelihoods object.
    // log10Posteriors is maintained incrementally in lockstep with log10Likelihoods:
    // it starts at the priors and receives the same likelihood increments.
    //
    protected double[] log10Likelihoods = null;
    protected double[] log10Posteriors = null;
    protected DiploidGenotypePriors priors = null;
    protected FourBaseProbabilities fourBaseLikelihoods = null;
    /**
     * Create a new GenotypeLikelihoods object with flat priors for each diploid genotype
     *
     * @param m base model
     */
    public GenotypeLikelihoods(BaseMismatchModel m) {
        this.priors = new DiploidGenotypePriors();
        initialize(m, null);
    }
    /**
     * Create a new GenotypeLikelihoods object with flat priors for each diploid genotype
     *
     * @param m base model
     * @param pl default platform
     */
    public GenotypeLikelihoods(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
        this.priors = new DiploidGenotypePriors();
        initialize(m, pl);
    }
    /**
     * Create a new GenotypeLikelihoods object with given priors for each diploid genotype
     *
     * @param m base model
     * @param priors priors
     */
    public GenotypeLikelihoods(BaseMismatchModel m, DiploidGenotypePriors priors) {
        this.priors = priors;
        initialize(m, null);
    }
    /**
     * Create a new GenotypeLikelihoods object with given priors for each diploid genotype
     *
     * @param m base model
     * @param priors priors
     * @param pl default platform
     */
    public GenotypeLikelihoods(BaseMismatchModel m, DiploidGenotypePriors priors, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
        this.priors = priors;
        initialize(m, pl);
    }
    /**
     * Cloning of the object.  Arrays are deep-copied; the priors reference is shared,
     * since DiploidGenotypePriors is treated as read-only here.
     *
     * @return clone
     * @throws CloneNotSupportedException
     */
    protected Object clone() throws CloneNotSupportedException {
        GenotypeLikelihoods c = (GenotypeLikelihoods)super.clone();
        c.priors = priors;
        c.log10Likelihoods = log10Likelihoods.clone();
        c.log10Posteriors = log10Posteriors.clone();
        c.fourBaseLikelihoods = (FourBaseProbabilities)fourBaseLikelihoods.clone();
        return c;
    }
    protected void initialize(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
        fourBaseLikelihoods = FourBaseProbabilitiesFactory.makeFourBaseLikelihoods(m, pl);
        setToZero();
    }
    // resets the running totals: likelihoods to zero, posteriors back to the bare priors
    protected void setToZero() {
        log10Likelihoods = zeros.clone();                 // likelihoods are all zeros
        log10Posteriors = priors.getPriors().clone();     // posteriors are all the priors
    }
    public void setVerbose(boolean v) {
        VERBOSE = v;
        fourBaseLikelihoods.setVerbose(v);
    }
    public boolean isVerbose() {
        return VERBOSE;
    }
    public int getMinQScoreToInclude() {
        return fourBaseLikelihoods.getMinQScoreToInclude();
    }
    public void setMinQScoreToInclude(int minQScoreToInclude) {
        fourBaseLikelihoods.setMinQScoreToInclude(minQScoreToInclude);
    }
    /**
     * Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values()
     * @return likelihoods array (log10 space, despite the name)
     */
    public double[] getLikelihoods() {
        return log10Likelihoods;
    }
    /**
     * Returns the log10 likelihood associated with DiploidGenotype g
     * @param g genotype
     * @return log10 likelihood as a double
     */
    public double getLikelihood(DiploidGenotype g) {
        return getLikelihoods()[g.ordinal()];
    }
    /**
     * Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values().
     *
     * @return raw log10 (not-normalized posteriors) as a double array
     */
    public double[] getPosteriors() {
        return log10Posteriors;
    }
    /**
     * Returns the posterior associated with DiploidGenotype g
     * @param g genotype
     * @return raw log10 (not-normalized posterior) as a double
     */
    public double getPosterior(DiploidGenotype g) {
        return getPosteriors()[g.ordinal()];
    }
    /**
     * Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values().
     *
     * @return normalized posteriors (linear space, summing to 1) as a double array
     */
    public double[] getNormalizedPosteriors() {
        double[] normalized = new double[log10Posteriors.length];
        double sum = 0.0;
        // for precision purposes, we need to add (or really subtract, since everything is negative)
        // the largest posterior value from all entries so that numbers don't get too small
        double maxValue = log10Posteriors[0];
        for (int i = 1; i < log10Posteriors.length; i++) {
            if ( maxValue < log10Posteriors[i] )
                maxValue = log10Posteriors[i];
        }
        // collect the posteriors
        for ( DiploidGenotype g : DiploidGenotype.values() ) {
            double posterior = Math.pow(10, getPosterior(g) - maxValue);
            normalized[g.ordinal()] = posterior;
            sum += posterior;
        }
        // normalize
        for (int i = 0; i < normalized.length; i++)
            normalized[i] /= sum;
        return normalized;
    }
    public DiploidGenotypePriors getPriorObject() {
        return priors;
    }
    /**
     * Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values().
     *
     * @return log10 prior as a double array
     */
    public double[] getPriors() {
        return priors.getPriors();
    }
    /**
     * Sets the priors and rebuilds the posteriors as prior + accumulated likelihood per genotype
     * @param priors priors
     */
    public void setPriors(DiploidGenotypePriors priors) {
        this.priors = priors;
        log10Posteriors = zeros.clone();
        for ( DiploidGenotype g : DiploidGenotype.values() ) {
            int i = g.ordinal();
            log10Posteriors[i] = priors.getPriors()[i] + log10Likelihoods[i];
        }
    }
    /**
     * Returns the prior associated with DiploidGenotype g
     * @param g genotype
     * @return log10 prior as a double
     */
    public double getPrior(DiploidGenotype g) {
        return getPriors()[g.ordinal()];
    }
    /**
     * Simple function to overload to control the caching of genotype likelihood outputs.
     * By default the function returns true -- do enable caching.  If you are experimenting with a
     * complex calculation of P(B | G) and caching might not work correctly for you, overload this
     * function and return false, so the super() object won't try to cache your GL calculations.
     *
     * @return true if caching should be enabled, false otherwise
     */
    public boolean cacheIsEnabled() {
        return enableCacheFlag;
    }
    public void setEnableCacheFlag(boolean enable) {
        enableCacheFlag = enable;
    }
    public int add(ReadBackedPileup pileup) {
        return add(pileup, false, false);
    }
    /**
     * Updates likelihoods and posteriors to reflect the additional observations contained within the
     * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
     * pileup
     *
     * @param pileup                    read pileup
     * @param ignoreBadBases            should we ignore bad bases?
     * @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality?
     * @return the number of good bases found in the pileup
     */
    public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
        int n = 0;
        for ( PileupElement p : pileup ) {
            // ignore deletions
            if ( p.isDeletion() )
                continue;
            byte base = p.getBase();
            if ( ! ignoreBadBases || ! badBase(base) ) {
                byte qual = capBaseQualsAtMappingQual ? (byte)Math.min((int)p.getQual(), p.getMappingQual()) : p.getQual();
                n += add(base, qual, p.getRead(), p.getOffset());
            }
        }
        return n;
    }
    /**
     * Folds one base observation into the running likelihoods and posteriors.
     * Per-observation GL vectors are memoized in the static CACHE when enabled,
     * keyed by (model, platform, base, qual, ploidy, strand) -- see setCache().
     *
     * @return 1 if the observation contributed, 0 if it was rejected (bad base / low qual)
     */
    public int add(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
        // Handle caching if requested.  Just look up the cached result if its available, or compute and store it
        GenotypeLikelihoods gl;
        if ( cacheIsEnabled() ) {
            if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read) ) {
                gl = calculateCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset);
            } else {
                gl = getCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read);
            }
        } else {
            gl = calculateGenotypeLikelihoods(observedBase, qualityScore, read, offset);
        }
        // for bad bases, there are no likelihoods
        if ( gl == null )
            return 0;
        // note: getLikelihoods() is log10 space, so these are log-space additions
        double[] likelihoods = gl.getLikelihoods();
        for ( DiploidGenotype g : DiploidGenotype.values() ) {
            double likelihood = likelihoods[g.ordinal()];
            if ( VERBOSE ) {
                boolean fwdStrand = ! read.getReadNegativeStrandFlag();
                System.out.printf("  L(%c | G=%s, Q=%d, S=%s) = %f / %f%n",
                        observedBase, g, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood);
            }
            log10Likelihoods[g.ordinal()] += likelihood;
            log10Posteriors[g.ordinal()] += likelihood;
        }
        return 1;
    }
    // Shared (static) memoization table of per-observation GL vectors, indexed by
    // [mismatch model][sequencer platform][observed base][qual][ploidy][strand].
    // NOTE(review): shared across all instances and not synchronized -- assumes
    // single-threaded use; confirm before using from multiple threads.
    static GenotypeLikelihoods[][][][][][] CACHE = new GenotypeLikelihoods[BaseMismatchModel.values().length][EmpiricalSubstitutionProbabilities.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY][2];
    protected boolean inCache( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
        return getCache(CACHE, observedBase, qualityScore, ploidy, read) != null;
    }
    protected GenotypeLikelihoods getCachedGenotypeLikelihoods( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
        GenotypeLikelihoods gl = getCache(CACHE, observedBase, qualityScore, ploidy, read);
        if ( gl == null )
            throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s",
                    observedBase, qualityScore, ploidy, read));
        return gl;
    }
    protected GenotypeLikelihoods calculateCachedGenotypeLikelihoods(byte observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset) {
        GenotypeLikelihoods gl = calculateGenotypeLikelihoods(observedBase, qualityScore, read, offset);
        setCache(CACHE, observedBase, qualityScore, ploidy, read, gl);
        return gl;
    }
    // stores val under the full cache key derived from the observation and the current model
    protected void setCache( GenotypeLikelihoods[][][][][][] cache,
                             byte observedBase, byte qualityScore, int ploidy,
                             SAMRecord read, GenotypeLikelihoods val ) {
        int m = FourBaseProbabilitiesFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal();
        int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read);
        int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
        int j = qualityScore;
        int k = ploidy;
        int x = strandIndex(! read.getReadNegativeStrandFlag() );
        cache[m][a][i][j][k][x] = val;
    }
    protected GenotypeLikelihoods getCache( GenotypeLikelihoods[][][][][][] cache,
                                            byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
        int m = FourBaseProbabilitiesFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal();
        int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read);
        int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
        int j = qualityScore;
        int k = ploidy;
        int x = strandIndex(! read.getReadNegativeStrandFlag() );
        return cache[m][a][i][j][k][x];
    }
    /**
     * Computes the per-genotype GL vector for one observation by combining the
     * four per-base likelihoods under the diploid model.  Returns a fresh clone
     * (zeroed via setToZero) so this object's running totals are untouched;
     * null if the observation was rejected.
     */
    protected GenotypeLikelihoods calculateGenotypeLikelihoods(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
        FourBaseProbabilities fbl = fourBaseLikelihoods.computeLog10Likelihoods(observedBase, qualityScore, read, offset);
        if ( fbl == null )
            return null;
        double[] fbLikelihoods = fbl.getLog10Likelihoods();
        try {
            GenotypeLikelihoods gl = (GenotypeLikelihoods)this.clone();
            gl.setToZero();
            // we need to adjust for ploidy.  We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space
            double ploidyAdjustment = log10(FIXED_PLOIDY);
            for ( DiploidGenotype g : DiploidGenotype.values() ) {
                // todo assumes ploidy is 2 -- should be generalized.  Obviously the below code can be turned into a loop
                double p_base = 0.0;
                p_base += pow(10, fbLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base1)] - ploidyAdjustment);
                p_base += pow(10, fbLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base2)] - ploidyAdjustment);
                double likelihood = log10(p_base);
                gl.log10Likelihoods[g.ordinal()] += likelihood;
                gl.log10Posteriors[g.ordinal()] += likelihood;
            }
            if ( VERBOSE ) {
                for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); }
                System.out.println();
                for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", gl.log10Likelihoods[g.ordinal()]); }
                System.out.println();
            }
            return gl;
        } catch ( CloneNotSupportedException e ) {
            throw new RuntimeException(e);
        }
    }
    // -----------------------------------------------------------------------------------------------------------------
    //
    //
    // helper routines
    //
    //
    // -----------------------------------------------------------------------------------------------------------------
    public static int strandIndex(boolean fwdStrand) {
        return fwdStrand ? 0 : 1;
    }
    /**
     * Returns true when the observedBase is considered bad and shouldn't be processed by this object.  A base
     * is considered bad if:
     *
     * Criterion 1: observed base isn't a A,C,T,G or lower case equivalent
     *
     * @param observedBase observed base
     * @return true if the base is a bad base
     */
    protected boolean badBase(byte observedBase) {
        return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1;
    }
    /**
     * Return a string representation of this object in a moderately usable form
     *
     * @return string representation
     */
    public String toString() {
        double sum = 0;
        StringBuilder s = new StringBuilder();
        for (DiploidGenotype g : DiploidGenotype.values()) {
            s.append(String.format("%s %.10f ", g, log10Likelihoods[g.ordinal()]));
            sum += Math.pow(10,log10Likelihoods[g.ordinal()]);
        }
        s.append(String.format(" %f", sum));
        return s.toString();
    }
    // -----------------------------------------------------------------------------------------------------------------
    //
    //
    // Validation routines
    //
    //
    // -----------------------------------------------------------------------------------------------------------------
    public boolean validate() {
        return validate(true);
    }
    /**
     * Verifies the priors plus every stored likelihood/posterior is a well-formed,
     * non-positive double.
     *
     * @param throwException if true, a malformed value raises a RuntimeException; otherwise false is returned
     * @return true if everything validates
     */
    public boolean validate(boolean throwException) {
        try {
            priors.validate(throwException);
            for ( DiploidGenotype g : DiploidGenotype.values() ) {
                String bad = null;
                int i = g.ordinal();
                if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) {
                    bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]);
                } else if ( ! MathUtils.wellFormedDouble(log10Posteriors[i]) || ! MathUtils.isNegativeOrZero(log10Posteriors[i]) ) {
                    bad = String.format("Posterior %f is badly formed", log10Posteriors[i]);
                }
                if ( bad != null ) {
                    throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
                }
            }
        } catch ( IllegalStateException e ) {
            if ( throwException )
                throw new RuntimeException(e);
            else
                return false;
        }
        return true;
    }
    //
    // Constant static data
    //
    final static double[] zeros = new double[DiploidGenotype.values().length];
    static {
        for ( DiploidGenotype g : DiploidGenotype.values() ) {
            zeros[g.ordinal()] = 0.0;
        }
    }
}

View File

@ -23,7 +23,7 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;

View File

@ -23,7 +23,7 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
package org.broadinstitute.sting.gatk.walkers.genotyper;
public interface GenotypePriors {

View File

@ -23,7 +23,7 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.Genotype;

View File

@@ -1,401 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.*;
import java.util.*;
/**
 * Base class for UnifiedGenotyper calculation models that jointly estimate the
 * alternate allele frequency (AF) across all samples at a locus.  It caches the
 * AF priors, accumulates log10 P(D|AF=i) for the single best alternate allele,
 * normalizes those into AF posteriors, and converts the result into a
 * VariantCallContext (including the strand-bias SLOD recomputation over the
 * forward- and reverse-strand stratifications).  Subclasses supply the actual
 * likelihood computation by overriding calculatelog10PofDgivenAFforAllF()
 * and/or calculateLog10PofDgivenAFforF().
 */
public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalculationModel {

    // for use in optimizing the P(D|AF) calculations:
    // how much off from the max likelihoods do we need to be before we can quit calculating?
    protected static final double LOG10_OPTIMIZATION_EPSILON = 8.0;

    // sentinel marking log10 P(D|AF=i) entries that were skipped by the optimization
    // (see ignoreAlleleFrequenciesAboveI)
    protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE;

    // lowest allele frequency worth testing during the strand-specific (SLOD) recalculations
    private int minAlleleFrequencyToTest;

    // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
    protected double[] log10AlleleFrequencyPriors;

    // the allele frequency posteriors and P(f>0) for each alternate allele,
    // indexed by BaseUtils base index; only the bestAlternateAllele slot is populated
    protected double[][] alleleFrequencyPosteriors = new double[BaseUtils.BASES.length][];
    protected double[][] log10PofDgivenAFi = new double[BaseUtils.BASES.length][];
    protected double[] PofFs = new double[BaseUtils.BASES.length];

    // the alternate allele with the largest sum of quality scores
    protected Byte bestAlternateAllele = null;

    // are we at a 'trigger' track site?
    protected boolean atTriggerTrack = false;

    // the standard filter to use for calls below the confidence threshold but above the emit threshold
    protected static final Set<String> filter = new HashSet<String>(1);

    /**
     * @param N              number of samples; 2N + 1 candidate allele frequencies are considered
     * @param heterozygosity heterozygosity value used to compute the AF priors
     */
    protected JointEstimateGenotypeCalculationModel(int N, double heterozygosity) {
        // NOTE(review): 'filter' is static, so this add is shared across instances
        // (harmless since it is a Set and the same element is re-added)
        filter.add(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME);
        computeAlleleFrequencyPriors(N, heterozygosity);
    }

    /** Extended (indel-style) events are not handled by this model. */
    public VariantCallContext callExtendedLocus(RefMetaDataTracker tracker, byte[] ref, GenomeLoc loc, Map<String, StratifiedAlignmentContext> stratifiedContexts) {
        return null;
    }

    /**
     * Main entry point: jointly estimates the alternate allele frequency over
     * all samples at this locus and emits a call (or reference confidence).
     *
     * @param tracker  reference metadata at this locus (trigger track, dbSNP)
     * @param ref      the reference base
     * @param loc      the genomic location
     * @param contexts per-sample stratified alignment contexts
     * @param priors   diploid genotype priors (unused directly here; subclasses may use them)
     * @return the resulting VariantCallContext, never null
     */
    public VariantCallContext callLocus(RefMetaDataTracker tracker, byte ref, GenomeLoc loc, Map<String, StratifiedAlignmentContext> contexts, DiploidGenotypePriors priors) {
        int numSamples = getNSamples(contexts);
        int frequencyEstimationPoints = (2 * numSamples) + 1;  // (add 1 for allele frequency of zero)

        // reset the optimization value
        minAlleleFrequencyToTest = 0;

        // find the alternate allele with the largest sum of quality scores
        initializeBestAlternateAllele(ref, contexts);

        // did we trigger on the provided track?
        atTriggerTrack = tracker.getReferenceMetaData(UnifiedGenotyperEngine.TRIGGER_TRACK_NAME, false).size() > 0;

        // if there are no non-ref bases...
        if ( bestAlternateAllele == null ) {
            // if we don't want all bases, then we don't need to calculate genotype likelihoods
            if ( !atTriggerTrack && !UAC.ALL_BASES_MODE && !UAC.GENOTYPE_MODE ) {
                VariantCallContext vcc = new VariantCallContext(false);
                estimateReferenceConfidence(vcc, contexts, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, false);
                return vcc;
            }

            // otherwise, choose any alternate allele (it doesn't really matter)
            bestAlternateAllele = (byte)(ref != 'A' ? 'A' : 'C');
        }

        // calculate likelihoods if there are non-ref bases
        initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
        calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
        calculatePofFs(ref, frequencyEstimationPoints);

        // print out stats if we have a writer
        if ( verboseWriter != null )
            printAlleleFrequencyData(ref, loc, frequencyEstimationPoints);

        VariantCallContext vcc = createCalls(tracker, ref, contexts, loc, frequencyEstimationPoints);
        // technically, at this point our confidence in a reference call isn't accurately
        // estimated because it didn't take into account samples with no data
        if ( vcc.vc == null )
            estimateReferenceConfidence(vcc, contexts, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, true);
        return vcc;
    }

    /** @return the lowest allele frequency that still needs to be tested (SLOD optimization). */
    protected int getMinAlleleFrequencyToTest() {
        return minAlleleFrequencyToTest;
    }

    /** @return the number of samples with data at this locus. */
    protected int getNSamples(Map<String, StratifiedAlignmentContext> contexts) {
        return contexts.size();
    }

    /**
     * Sets bestAlternateAllele to the non-reference base with the largest sum of
     * base quality scores across all samples (null if no non-ref base was seen).
     *
     * @param ref      the reference base
     * @param contexts per-sample stratified alignment contexts
     */
    protected void initializeBestAlternateAllele(byte ref, Map<String, StratifiedAlignmentContext> contexts) {
        int[] qualCounts = new int[4];

        for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
            // calculate the sum of quality scores for each base
            ReadBackedPileup pileup = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup();
            for ( PileupElement p : pileup ) {
                // ignore deletions
                if ( p.isDeletion() )
                    continue;
                int index = BaseUtils.simpleBaseToBaseIndex(p.getBase());
                if ( index >= 0 )
                    qualCounts[index] += p.getQual();
            }
        }

        // set the non-ref base with maximum quality score sum
        int maxCount = 0;
        bestAlternateAllele = null;
        for ( byte altAllele : BaseUtils.BASES ) {
            if ( altAllele == ref )
                continue;
            int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
            if ( qualCounts[index] > maxCount ) {
                maxCount = qualCounts[index];
                bestAlternateAllele = altAllele;
            }
        }
    }

    /**
     * Hook for subclasses to set up per-stratification state before the
     * P(D|AF) calculations run.
     */
    protected void initialize(byte ref, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {
        // by default, no initialization is done
        return;
    }

    /**
     * Fills log10AlleleFrequencyPriors with the standard theta/i priors for
     * AF = 1..2N, and puts the remaining probability mass at AF = 0.
     *
     * @param N              number of samples
     * @param heterozygosity the heterozygosity (theta)
     */
    protected void computeAlleleFrequencyPriors(int N, double heterozygosity) {
        int AFs = (2 * N) + 1;
        log10AlleleFrequencyPriors = new double[AFs];

        // calculate the allele frequency priors for 1-N
        double sum = 0.0;
        for (int i = 1; i < AFs; i++) {
            double value = heterozygosity / (double)i;
            log10AlleleFrequencyPriors[i] = Math.log10(value);
            sum += value;
        }

        // null frequency for AF=0 is (1 - sum(all other frequencies))
        log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum);
    }

    /**
     * Estimates the confidence that this site is homozygous reference, folding
     * in samples with no data via a binomial no-coverage model, and stores the
     * verdict in vcc.confidentlyCalled.
     *
     * @param vcc                  the call context to update
     * @param contexts             per-sample contexts (covered samples)
     * @param theta                heterozygosity to use
     * @param ignoreCoveredSamples if true, start from the computed P(f=0) and only add uncovered samples
     */
    private void estimateReferenceConfidence(VariantCallContext vcc, Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples) {
        double P_of_ref = 1.0;

        // use the AF=0 prob if it's calculated
        if ( ignoreCoveredSamples )
            P_of_ref = 1.0 - PofFs[BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele)];

        // for each sample that we haven't examined yet
        for ( String sample : samples ) {
            boolean isCovered = contexts.containsKey(sample);
            if ( ignoreCoveredSamples && isCovered )
                continue;
            int depth = isCovered ? contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup().size() : 0;
            // chance the sample is truly non-ref yet shows no alt evidence at this depth
            P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
        }

        vcc.confidentlyCalled = QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING;
    }

    /**
     * Allocates the per-allele posterior/likelihood arrays and computes
     * log10 P(D|AF=i) for the best alternate allele only.
     */
    protected void calculateAlleleFrequencyPosteriors(byte ref, int frequencyEstimationPoints, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {
        // initialization
        for ( byte altAllele : BaseUtils.BASES ) {
            int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
            alleleFrequencyPosteriors[baseIndex] = new double[frequencyEstimationPoints];
            log10PofDgivenAFi[baseIndex] = new double[frequencyEstimationPoints];
        }

        // only calculate for the most likely alternate allele
        calculatelog10PofDgivenAFforAllF(ref, bestAlternateAllele, frequencyEstimationPoints-1, contexts, contextType);
    }

    /********************************************************************************/
    /*** One or both of the following methods should be overloaded in subclasses ***/
    /*** so that the correct calculation is made for PofDgivenAFi                 ***/
    /********************************************************************************/

    /**
     * @param ref            the ref base
     * @param alt            the alt base
     * @param numFrequencies total number of allele frequencies (2N)
     * @param contexts       stratified alignment contexts
     * @param contextType    which stratification to use
     */
    protected void calculatelog10PofDgivenAFforAllF(byte ref, byte alt, int numFrequencies, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {
        int baseIndex = BaseUtils.simpleBaseToBaseIndex(alt);

        // for each minor allele frequency, calculate log10PofDgivenAFi
        for (int i = 0; i <= numFrequencies; i++) {
            double f = (double)i / (double)(numFrequencies);
            log10PofDgivenAFi[baseIndex][i] += calculateLog10PofDgivenAFforF(ref, alt, f, contexts, contextType);
        }
    }

    /**
     * @param ref         the ref base
     * @param alt         the alt base
     * @param f           the allele frequency
     * @param contexts    stratified alignment contexts
     * @param contextType which stratification to use
     *
     * @return value of PofDgivenAF for allele frequency f
     */
    protected double calculateLog10PofDgivenAFforF(byte ref, byte alt, double f, Map<String, StratifiedAlignmentContext> contexts, StratifiedAlignmentContext.StratifiedContextType contextType) {
        return 0.0;
    }

    /********************************************************************************/

    /**
     * Marks all allele frequencies above freqI as not calculated (optimization).
     *
     * @param freqI          allele frequency I
     * @param numFrequencies total number of allele frequencies
     * @param altBaseIndex   the index of the alternate allele
     */
    protected void ignoreAlleleFrequenciesAboveI(int freqI, int numFrequencies, int altBaseIndex) {
        while ( ++freqI <= numFrequencies )
            log10PofDgivenAFi[altBaseIndex][freqI] = VALUE_NOT_CALCULATED;
    }

    /**
     * Converts the cached log10 P(D|AF=i) into normalized AF posteriors for the
     * best alternate allele, and computes P(f>0).
     *
     * @param ref                       the ref base
     * @param frequencyEstimationPoints number of allele frequencies
     */
    protected void calculatePofFs(byte ref, int frequencyEstimationPoints) {
        // only calculate for the most likely alternate allele
        int baseIndex = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele);

        // multiply by null allele frequency priors to get AF posteriors, then normalize
        for (int i = 0; i < frequencyEstimationPoints; i++)
            alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i];
        alleleFrequencyPosteriors[baseIndex] = MathUtils.normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]);

        // calculate p(f>0)
        double sum = 0.0;
        for (int i = 1; i < frequencyEstimationPoints; i++)
            sum += alleleFrequencyPosteriors[baseIndex][i];
        PofFs[baseIndex] = Math.min(sum, 1.0); // deal with precision errors
    }

    /**
     * Writes verbose per-frequency debugging output to verboseWriter.
     *
     * @param ref                       the ref base
     * @param loc                       the GenomeLoc
     * @param frequencyEstimationPoints number of allele frequencies
     */
    protected void printAlleleFrequencyData(byte ref, GenomeLoc loc, int frequencyEstimationPoints) {
        for (int i = 0; i < frequencyEstimationPoints; i++) {
            StringBuilder AFline = new StringBuilder("AFINFO\t");
            AFline.append(loc).append("\t");
            AFline.append(i + "/" + (frequencyEstimationPoints-1) + "\t");
            AFline.append(String.format("%.2f\t", ((float)i)/ (frequencyEstimationPoints-1)));
            AFline.append(String.format("%.8f", log10AlleleFrequencyPriors[i]) + "\t");
            for ( byte altAllele : BaseUtils.BASES ) {
                if ( altAllele != ref ) {
                    int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
                    double PofDgivenAF = log10PofDgivenAFi[baseIndex][i];
                    if ( PofDgivenAF == VALUE_NOT_CALCULATED )
                        PofDgivenAF = 0.0;
                    AFline.append(String.format("%.8f\t%.8f\t", PofDgivenAF, alleleFrequencyPosteriors[baseIndex][i]));
                } else {
                    AFline.append(String.format("%.8f\t%.8f\t", -1.0, -1.0));
                }
            }
            verboseWriter.println(AFline);
        }

        for ( byte altAllele : BaseUtils.BASES ) {
            if ( altAllele != ref ) {
                char base = Character.toLowerCase((char)altAllele);
                int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
                if ( MathUtils.compareDoubles(PofFs[baseIndex], 0.0) != 0 ) {
                    double phredScaledConfidence = QualityUtils.phredScaleErrorRate(alleleFrequencyPosteriors[baseIndex][0]);
                    if ( Double.isInfinite(phredScaledConfidence) ) {
                        // confidence overflowed phred scaling; fall back to the raw log10 value
                        phredScaledConfidence = -10.0 * log10PofDgivenAFi[baseIndex][0];
                        verboseWriter.println("P(f>0)_" + base + " = 1");
                        verboseWriter.println("Qscore_" + base + " = " + phredScaledConfidence);
                        verboseWriter.println("LOD_" + base + " = " + phredScaledConfidence);
                    } else {
                        verboseWriter.println("P(f>0)_" + base + " = " + PofFs[baseIndex]);
                        verboseWriter.println("Qscore_" + base + " = " + (QualityUtils.phredScaleErrorRate(alleleFrequencyPosteriors[baseIndex][0])));
                        verboseWriter.println("LOD_" + base + " = " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
                    }
                }
            }
        }
        verboseWriter.println();
    }

    /**
     * Hook for subclasses to produce per-sample genotype calls.
     *
     * @return map from sample name to Genotype (empty by default)
     */
    protected Map<String, Genotype> makeGenotypeCalls(byte ref, byte alt, int frequency, Map<String, StratifiedAlignmentContext> contexts, GenomeLoc loc) {
        // by default, we return no genotypes
        return new HashMap<String, Genotype>();
    }

    /**
     * Builds the final VariantCallContext from the computed posteriors,
     * including the strand-bias (SLOD) recalculation when enabled.  Note that
     * the SLOD pass overwrites the shared posterior arrays, so it must run last.
     */
    protected VariantCallContext createCalls(RefMetaDataTracker tracker, byte ref, Map<String, StratifiedAlignmentContext> contexts, GenomeLoc loc, int frequencyEstimationPoints) {
        // only need to look at the most likely alternate allele
        int indexOfMax = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele);

        int bestAFguess = Utils.findIndexOfMaxEntry(alleleFrequencyPosteriors[indexOfMax]);
        double phredScaledConfidence;
        if ( bestAFguess != 0 ) {
            // variant call: confidence comes from the AF=0 posterior
            phredScaledConfidence = QualityUtils.phredScaleErrorRate(alleleFrequencyPosteriors[indexOfMax][0]);
            if ( Double.isInfinite(phredScaledConfidence) )
                phredScaledConfidence = -10.0 * log10PofDgivenAFi[indexOfMax][0];
        } else {
            // reference call: confidence comes from P(f>0)
            phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofFs[indexOfMax]);
            if ( Double.isInfinite(phredScaledConfidence) ) {
                double sum = 0.0;
                for (int i = 1; i < frequencyEstimationPoints; i++) {
                    if ( log10PofDgivenAFi[indexOfMax][i] == VALUE_NOT_CALCULATED )
                        break;
                    sum += log10PofDgivenAFi[indexOfMax][i];
                }
                phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
            }
        }

        // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
        if ( !UAC.ALL_BASES_MODE && !passesEmitThreshold(phredScaledConfidence, bestAFguess) )
            return new VariantCallContext(passesCallThreshold(phredScaledConfidence));

        // populate the sample-specific data (output it to beagle also if requested)
        Map<String, Genotype> genotypes = makeGenotypeCalls(ref, bestAlternateAllele, bestAFguess, contexts, loc);

        // next, the variant context data (alleles, attributes, etc.)
        ArrayList<Allele> alleles = new ArrayList<Allele>();
        alleles.add(Allele.create(ref, true));
        if ( bestAFguess != 0 )
            alleles.add(Allele.create(bestAlternateAllele, false));

        // *** note that calculating strand bias involves overwriting data structures, so we do that last
        HashMap<String, Object> attributes = new HashMap<String, Object>();

        DbSNPFeature dbsnp = getDbSNP(tracker);
        if ( dbsnp != null )
            attributes.put(VariantContext.ID_KEY, dbsnp.getRsID());

        if ( !UAC.NO_SLOD ) {
            // the overall lod
            double overallLog10PofNull = log10AlleleFrequencyPriors[0] + log10PofDgivenAFi[indexOfMax][0];
            double overallLog10PofF = log10AlleleFrequencyPriors[bestAFguess] + log10PofDgivenAFi[indexOfMax][bestAFguess];
            double lod = overallLog10PofF - overallLog10PofNull;

            // set the optimization value for the subsequent strand calculations
            minAlleleFrequencyToTest = bestAFguess;

            // the forward lod
            initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD);
            calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD);
            calculatePofFs(ref, frequencyEstimationPoints);
            double forwardLog10PofNull = log10AlleleFrequencyPriors[0] + log10PofDgivenAFi[indexOfMax][0];
            double forwardLog10PofF = log10AlleleFrequencyPriors[bestAFguess] + log10PofDgivenAFi[indexOfMax][bestAFguess];

            // the reverse lod
            initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE);
            calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE);
            calculatePofFs(ref, frequencyEstimationPoints);
            double reverseLog10PofNull = log10AlleleFrequencyPriors[0] + log10PofDgivenAFi[indexOfMax][0];
            double reverseLog10PofF = log10AlleleFrequencyPriors[bestAFguess] + log10PofDgivenAFi[indexOfMax][bestAFguess];

            double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofNull;
            double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofNull;
            //logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);

            // strand score is max bias between forward and reverse strands
            double strandScore = Math.max(forwardLod - lod, reverseLod - lod);
            // rescale by a factor of 10
            strandScore *= 10.0;
            //logger.debug(String.format("SLOD=%f", strandScore));

            attributes.put("SB", Double.valueOf(strandScore));
        }

        VariantContext vc = new VariantContext("UG_SNP_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes);
        return new VariantCallContext(vc, passesCallThreshold(phredScaledConfidence));
    }

    /**
     * @return true if this call should be emitted, using the trigger-track
     *         thresholds when at a trigger site and the standard ones otherwise
     */
    protected boolean passesEmitThreshold(double conf, int bestAFguess) {
        return (atTriggerTrack ?
                (conf >= Math.min(UAC.TRIGGER_CONFIDENCE_FOR_CALLING, UAC.TRIGGER_CONFIDENCE_FOR_EMITTING)) :
                ((UAC.GENOTYPE_MODE || bestAFguess != 0) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING)));
    }

    /**
     * @return true if this call meets the (trigger or standard) calling threshold
     */
    protected boolean passesCallThreshold(double conf) {
        return (atTriggerTrack ?
                (conf >= UAC.TRIGGER_CONFIDENCE_FOR_CALLING) :
                (conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING));
    }
}

View File

@@ -1,35 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMRecord;
/**
* This implements the old style CRD calculation of the chance that a base being a true chromBase given
* an miscalled base, in which the p is e, grabbing all of the probability. It shouldn't be used
*/
/**
 * Legacy "one-state" CRD error model: when a base is miscalled, the true base
 * is assumed to soak up all of the error probability (p = e), so the
 * correction term is always log10(1) = 0.  Retained for comparison only; it
 * shouldn't be used.
 */
public class OneStateErrorProbabilities extends FourBaseProbabilities {

    /** No state of its own -- simply forwards to the superclass constructor. */
    public OneStateErrorProbabilities() { super(); }

    /**
     * Clone support, delegating entirely to the superclass.
     *
     * @return clone
     * @throws CloneNotSupportedException if the superclass refuses to clone
     */
    protected Object clone() throws CloneNotSupportedException {
        return super.clone();
    }

    /**
     * @param observedBase observed base
     * @param chromBase    target base
     * @param read         SAM read
     * @param offset       offset on read
     * @return log10 likelihood; always 0 (probability 1), i.e. the "e model"
     */
    protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
        return 0; // equivalent to e model: log10(1) == 0
    }
}

View File

@@ -23,7 +23,7 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.exceptions.StingException;

View File

@@ -1,120 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.*;
import java.util.*;
/**
 * A very simple consensus-based indel caller.  At extended-event loci it tallies
 * all indel events across samples, picks the single most frequent one, and
 * emits a call when coverage, indel fraction, and consensus fraction all clear
 * hard-coded thresholds.  No genotype likelihoods are computed.
 */
public class SimpleIndelCalculationModel extends GenotypeCalculationModel {

    private final GenomeLocParser genomeLocParser;

    // hard call thresholds; NOTE(review): not exposed through the argument collection
    private int MIN_COVERAGE = 6;           // minimum total read depth to attempt a call
    private double MIN_FRACTION = 0.3;      // minimum fraction of reads carrying any indel
    private double MIN_CONSENSUS_FRACTION = 0.7 ; // minimum fraction of indel reads agreeing on the best event

    // the previous normal event context
    // private Map<String, StratifiedAlignmentContext> cachedContext;

    protected SimpleIndelCalculationModel(GenomeLocParser genomeLocParser) {
        this.genomeLocParser = genomeLocParser;
    }

    // per-locus tallies, reset at the start of every callExtendedLocus() invocation
    private int totalIndels = 0;       // reads carrying any indel event
    private int totalCoverage = 0;     // total reads across all samples
    private int bestIndelCount = 0;    // support for the most frequent event
    private String bestEvent = null;   // the most frequent event string ('+'/'-' prefixed)

    /** Regular (point) loci are not handled by this model; always returns null. */
    public VariantCallContext callLocus(RefMetaDataTracker tracker, byte ref, GenomeLoc loc, Map<String, StratifiedAlignmentContext> contexts, DiploidGenotypePriors priors) {
        // cachedContext = contexts;
        return null;
    }

    /**
     * Calls an indel at an extended-event locus.
     *
     * @param tracker  reference metadata at this locus
     * @param ref      reference bases spanning the event
     * @param loc      the genomic location (stop is adjusted for deletions)
     * @param contexts per-sample stratified alignment contexts
     * @return a VariantCallContext; confidentlyCalled is false when any threshold fails
     */
    public VariantCallContext callExtendedLocus(RefMetaDataTracker tracker, byte[] ref, GenomeLoc loc, Map<String, StratifiedAlignmentContext> contexts) {
        totalIndels = 0;
        totalCoverage = 0;
        bestIndelCount = 0;
        bestEvent = null;
        /*
        System.out.println("\nReached " + loc + " through an extended event");
        for (Map.Entry<String,StratifiedAlignmentContext> e : contexts.entrySet()) {
            System.out.println("Set "+e.getKey());
            System.out.println("   Context: "+e.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size()+ " reads");
            ReadBackedExtendedEventPileup p = e.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getExtendedEventPileup();
            if ( p== null ) System.out.println("EXTENDED PILEUP IS NULL");
            System.out.println("   Event(s): " + e.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getExtendedEventPileup().getEventStringsWithCounts(ref));
        }
        */
        initializeAlleles(ref, contexts);

        VariantCallContext vcc = new VariantCallContext(false);

        // apply the hard thresholds in order; any failure yields a non-call
        if ( totalIndels == 0 ) return vcc; // this can happen if indel-containing reads get filtered out by the engine
        if ( totalCoverage < MIN_COVERAGE ) return vcc;
        if ( ((double)bestIndelCount)/totalCoverage < MIN_FRACTION ) return vcc;
        if ( ((double)bestIndelCount)/totalIndels < MIN_CONSENSUS_FRACTION ) return vcc;

        List<Allele> alleles = new ArrayList<Allele>(2);

        // event strings are '+'-prefixed for insertions and '-'-prefixed for deletions
        if ( bestEvent.charAt(0) == '+') {
            alleles.add( Allele.create(Allele.NULL_ALLELE_STRING,true) );
            alleles.add( Allele.create(bestEvent.substring(1), false ));
        } else {
            if ( bestEvent.charAt(0) == '-' ) {
                alleles.add( Allele.create(Allele.NULL_ALLELE_STRING,false) );
                alleles.add( Allele.create(bestEvent.substring(1), true ));
                // deletions span reference bases, so extend the locus stop accordingly
                loc = genomeLocParser.setStop(loc, loc.getStop() + bestEvent.length()-1);
            } else
                throw new ReviewedStingException("Internal error (probably a bug): event does not conform to expected format: "+ bestEvent);
        }

        VariantContext vc = new VariantContext("UG_Indel_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, new HashMap<String, Genotype>() /* genotypes */,
                -1.0 /* log error */, null /* filters */, null /* attributes */);
        vcc = new VariantCallContext(vc,true);
        vcc.setRefBase(ref[0]);
        /*
        if ( totalIndels > 0 ) {
            System.out.println("Calling: "+bestEvent+" ["+bestIndelCount+"/"+totalIndels+"/"+totalCoverage+"] at "+loc);
        } else {
            System.out.println("NO EVENT");
        }
        */
        //VariantContext vc = new MutableVariantContext("UG_indel_call", loc, alleles, genotypes, phredScaledConfidence/10.0, null, attributes);
        //return new VariantCallContext(vc, phredScaledConfidence >= CONFIDENCE_THRESHOLD);
        return vcc;
    }

    /**
     * Tallies totalCoverage, totalIndels, and the single best-supported event
     * across all samples' extended-event pileups.
     *
     * @param ref      reference bases spanning the event
     * @param contexts per-sample stratified alignment contexts
     */
    protected void initializeAlleles(byte[] ref, Map<String, StratifiedAlignmentContext> contexts) {
        for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
            AlignmentContext context = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
            totalCoverage += context.size();
            // calculate the sum of quality scores for each base
            ReadBackedExtendedEventPileup pileup = context.getExtendedEventPileup();
            List<Pair<String,Integer>> all_events = pileup.getEventStringsWithCounts(ref);
            for ( Pair<String,Integer> p : all_events ) {
                if ( p.second > bestIndelCount ) {
                    bestIndelCount = p.second;
                    bestEvent = p.first;
                }
                totalIndels += p.second;
            }
        }
    }
}

View File

@@ -1,10 +1,35 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import static java.lang.Math.log10;
import net.sf.samtools.SAMRecord;
public class ThreeStateErrorProbabilities extends FourBaseProbabilities {
public class ThreeStateErrorProbabilities extends FourBaseLikelihoods {
//
// forwarding constructors -- don't do anything at all
//

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2010 The Broad Institute
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@ -32,14 +32,17 @@ import org.broadinstitute.sting.commandline.Hidden;
public class UnifiedArgumentCollection {
// control the various models to be used
@Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- JOINT_ESTIMATE is currently the default.", required = false)
public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.JOINT_ESTIMATE;
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while DINDEL is also available for calling indels.", required = false)
public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false)
public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ when using the SNP Genotype Likelihoods model -- EMPIRICAL is the recommended default, but it's possible to select the THREE_STATE model for comparison purposes", required = false)
public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL;
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
public Double heterozygosity = DiploidGenotypePriors.HUMAN_HETEROZYGOSITY;
public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY;
// control the output
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false)
@ -49,16 +52,16 @@ public class UnifiedArgumentCollection {
public boolean ALL_BASES_MODE = false;
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false)
public double STANDARD_CONFIDENCE_FOR_CALLING = 50.0;
public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
public double STANDARD_CONFIDENCE_FOR_EMITTING = 50.0;
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
@Argument(fullName = "trigger_min_confidence_threshold_for_calling", shortName = "trig_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be called", required = false)
public double TRIGGER_CONFIDENCE_FOR_CALLING = 50.0;
public double TRIGGER_CONFIDENCE_FOR_CALLING = 30.0;
@Argument(fullName = "trigger_min_confidence_threshold_for_emitting", shortName = "trig_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
public double TRIGGER_CONFIDENCE_FOR_EMITTING = 50.0;
public double TRIGGER_CONFIDENCE_FOR_EMITTING = 30.0;
@Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false)
public boolean NO_SLOD = false;
@ -76,7 +79,7 @@ public class UnifiedArgumentCollection {
// control the various parameters to be used
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
public int MIN_BASE_QUALTY_SCORE = 20;
public int MIN_BASE_QUALTY_SCORE = 17;
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false)
public int MIN_MAPPING_QUALTY_SCORE = 20;
@ -90,14 +93,11 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
public Double MAX_DELETION_FRACTION = 0.05;
@Argument(fullName = "cap_base_quality_by_mapping_quality", shortName = "cap_base_qual", doc = "Cap the base quality of any given base by its read's mapping quality", required = false)
public boolean CAP_BASE_QUALITY = false;
public UnifiedArgumentCollection clone() {
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
uac.genotypeModel = genotypeModel;
uac.GLmodel = GLmodel;
uac.baseModel = baseModel;
uac.heterozygosity = heterozygosity;
uac.GENOTYPE_MODE = GENOTYPE_MODE;

View File

@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
@ -44,21 +45,18 @@ import java.io.PrintStream;
* A variant caller which unifies the approaches of several disparate callers. Works for single-sample and
* multi-sample data. The user can choose from several different incorporated calculation models.
*/
@Reference(window=@Window(start=-20,stop=20))
@Reference(window=@Window(start=-200,stop=200))
@By(DataSource.REFERENCE)
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGenotyper.UGStatistics> implements TreeReducible<UnifiedGenotyper.UGStatistics> {
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
// control the output
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null;
@Argument(fullName="variants_out",shortName="varout",doc="Please use --out instead",required=false)
@Deprecated
protected String varout;
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
protected PrintStream verboseWriter = null;
@ -81,7 +79,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
public boolean includeReadsWithDeletionAtLoci() { return true; }
// enable extended events for indels
public boolean generateExtendedEvents() { return UAC.genotypeModel == GenotypeCalculationModel.Model.DINDEL; }
public boolean generateExtendedEvents() { return UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL; }
/**
* Inner class for collecting output statistics from the UG
@ -104,7 +102,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
double percentCallableOfAll() { return (100.0 * nBasesCallable) / (nBasesVisited-nExtendedEvents); }
double percentCalledOfAll() { return (100.0 * nBasesCalledConfidently) / (nBasesVisited-nExtendedEvents); }
double percentCalledOfCallable() { return (100.0 * nBasesCalledConfidently) / (nBasesVisited-nExtendedEvents); }
double percentCalledOfCallable() { return (100.0 * nBasesCalledConfidently) / (nBasesCallable); }
}
/**
@ -112,21 +110,23 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
*
**/
public void initialize() {
annotationEngine = new VariantAnnotatorEngine(getToolkit(), Arrays.asList(annotationClassesToUse), annotationsToUse);
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, writer, verboseWriter, annotationEngine);
// get all of the unique sample names
// if we're supposed to assume a single sample, do so
Set<String> samples = new TreeSet<String>();
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
else
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
// initialize the writers
if ( verboseWriter != null ) {
StringBuilder header = new StringBuilder("AFINFO\tLOC\tMAF\tF\tNullAFpriors\t");
for ( byte altAllele : BaseUtils.BASES ) {
header.append("POfDGivenAFFor" + (char)altAllele + "\t");
header.append("PosteriorAFFor" + (char)altAllele + "\t");
}
verboseWriter.println(header);
}
// initialize the verbose writer
if ( verboseWriter != null )
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior");
annotationEngine = new VariantAnnotatorEngine(getToolkit(), Arrays.asList(annotationClassesToUse), annotationsToUse);
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples);
// initialize the header
writer.writeHeader(new VCFHeader(getHeaderInfo(), UG_engine.samples)) ;
writer.writeHeader(new VCFHeader(getHeaderInfo(), samples)) ;
}
private Set<VCFHeaderLine> getHeaderInfo() {
@ -136,14 +136,24 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
// annotation (INFO) fields from UnifiedGenotyper
for ( Map.Entry<String, String> dbSet : UG_engine.dbAnnotations.entrySet() )
headerInfo.add(new VCFInfoHeaderLine(dbSet.getValue(), 0, VCFHeaderLineType.Flag, (dbSet.getKey().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ? "dbSNP" : dbSet.getValue()) + " Membership"));
if ( !UAC.NO_SLOD )
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias"));
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
// also, check to see whether comp rods were included
List<ReferenceOrderedDataSource> dataSources = getToolkit().getRodDataSources();
for ( ReferenceOrderedDataSource source : dataSources ) {
if ( source.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership"));
}
else if ( source.getName().startsWith(VariantAnnotatorEngine.dbPrefix) ) {
String name = source.getName().substring(VariantAnnotatorEngine.dbPrefix.length());
headerInfo.add(new VCFInfoHeaderLine(name, 0, VCFHeaderLineType.Flag, name + " Membership"));
}
}
// FORMAT and INFO fields
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings(VCFConstants.GENOTYPE_LIKELIHOODS_KEY));
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY));
// FILTER fields
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING ||
@ -162,7 +172,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
* @return the VariantCallContext object
*/
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
return UG_engine.runGenotyper(tracker, refContext, rawContext);
return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext);
}
public UGStatistics reduceInit() { return new UGStatistics(); }

View File

@ -26,202 +26,656 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFWriter;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.Allele;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.*;
import org.broadinstitute.sting.utils.sam.GATKSAMRecordFilter;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broad.tribble.vcf.VCFConstants;
import org.broad.tribble.dbsnp.DbSNPFeature;
import java.io.PrintStream;
import java.util.*;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.StringUtil;
import net.sf.picard.reference.IndexedFastaSequenceFile;
public class UnifiedGenotyperEngine {
public static final String TRIGGER_TRACK_NAME = "trigger";
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
protected HashMap<String, String> dbAnnotations = new HashMap<String, String>();
// the unified argument collection
protected UnifiedArgumentCollection UAC = null;
private UnifiedArgumentCollection UAC = null;
// the annotation engine
protected VariantAnnotatorEngine annotationEngine;
private VariantAnnotatorEngine annotationEngine;
// the model used for calculating genotypes
protected ThreadLocal<GenotypeCalculationModel> gcm = new ThreadLocal<GenotypeCalculationModel>();
private ThreadLocal<GenotypeLikelihoodsCalculationModel> glcm = new ThreadLocal<GenotypeLikelihoodsCalculationModel>();
// the various loggers and writers
protected Logger logger = null;
protected VCFWriter vcfWriter = null;
protected PrintStream verboseWriter = null;
// the model used for calculating p(non-ref)
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
private double[] log10AlleleFrequencyPriors;
// the allele frequency likelihoods (allocated once as an optimization)
private ThreadLocal<double[]> log10AlleleFrequencyPosteriors = new ThreadLocal<double[]>();
// the priors object
private GenotypePriors genotypePriors;
// samples in input
protected Set<String> samples = new HashSet<String>();
private Set<String> samples = new TreeSet<String>();
// the various loggers and writers
private Logger logger = null;
private PrintStream verboseWriter = null;
// fasta reference reader to supplement the edges of the reference sequence for long reads
private IndexedFastaSequenceFile referenceReader;
// number of chromosomes (2 * samples) in input
private int N;
// the standard filter to use for calls below the confidence threshold but above the emit threshold
private static final Set<String> filter = new HashSet<String>(1);
private static final int MISMATCH_WINDOW_SIZE = 20;
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
initialize(toolkit, UAC, null, null, null, null);
// get the number of samples
// if we're supposed to assume a single sample, do so
samples = new TreeSet<String>();
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
else
samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader());
initialize(toolkit, UAC, null, null, null, samples.size());
}
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, VCFWriter genotypeWriter, PrintStream verboseWriter, VariantAnnotatorEngine engine) {
initialize(toolkit, UAC, logger, genotypeWriter, verboseWriter, engine);
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set<String> samples) {
this.samples = new TreeSet<String>(samples);
initialize(toolkit, UAC, logger, verboseWriter, engine, samples.size());
}
private void initialize(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, VCFWriter genotypeWriter, PrintStream verboseWriter, VariantAnnotatorEngine engine) {
private void initialize(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
this.UAC = UAC;
this.logger = logger;
this.vcfWriter = genotypeWriter;
this.verboseWriter = verboseWriter;
this.annotationEngine = engine;
// deal with input errors
if ( toolkit.getArguments().numberOfThreads > 1 && UAC.ASSUME_SINGLE_SAMPLE != null ) {
// the ASSUME_SINGLE_SAMPLE argument can't be handled (at least for now) while we are multi-threaded because the IO system doesn't know how to get the sample name
throw new IllegalArgumentException("For technical reasons, the ASSUME_SINGLE_SAMPLE argument cannot be used with multiple threads; please run again without the -nt argument");
}
N = 2 * numSamples;
log10AlleleFrequencyPriors = new double[N+1];
computeAlleleFrequencyPriors(N);
genotypePriors = createGenotypePriors(UAC);
// get all of the unique sample names
// if we're supposed to assume a single sample, do so
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
this.samples.add(UAC.ASSUME_SINGLE_SAMPLE);
else
this.samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader());
filter.add(LOW_QUAL_FILTER_NAME);
// check to see whether comp rods were included
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
for ( ReferenceOrderedDataSource source : dataSources ) {
if ( source.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
dbAnnotations.put(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, VCFConstants.DBSNP_KEY);
}
else if ( source.getName().startsWith(VariantAnnotatorEngine.dbPrefix) ) {
dbAnnotations.put(source.getName(), source.getName().substring(VariantAnnotatorEngine.dbPrefix.length()));
}
}
referenceReader = new IndexedFastaSequenceFile(toolkit.getArguments().referenceFile);
}
/**
* Compute at a given locus.
* Compute full calls at a given locus.
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @return the VariantCallContext object
*/
public VariantCallContext runGenotyper(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
// initialize the GenotypeCalculationModel for this thread if that hasn't been done yet
if ( gcm.get() == null ) {
gcm.set(GenotypeCalculationModelFactory.makeGenotypeCalculation(samples, logger, UAC, verboseWriter));
}
byte ref = refContext.getBase();
if ( !BaseUtils.isRegularBase(ref) )
public VariantCallContext calculateLikelihoodsAndGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
if ( vc == null )
return null;
VariantCallContext call;
BadReadPileupFilter badReadPileupFilter = new BadReadPileupFilter(refContext);
VariantCallContext vcc = calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc);
vcc.vc = GLsToPLs(vcc.vc);
return vcc;
}
if ( rawContext.hasExtendedEventPileup() ) {
/**
* Compute GLs at a given locus.
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @return the VariantContext object
*/
public VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
return GLsToPLs(vc);
}
ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup();
private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map<String, StratifiedAlignmentContext> stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType type) {
if ( stratifiedContexts == null )
return null;
// filter the context based on min mapping quality
ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE);
// filter the context based on bad mates and mismatch rate
pileup = pileup.getFilteredPileup(badReadPileupFilter);
// don't call when there is no coverage
if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE )
return null;
// stratify the AlignmentContext and cut by sample
Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
if ( stratifiedContexts == null )
return null;
call = gcm.get().callExtendedLocus(tracker, refContext.getBasesAtLocus(-1), rawContext.getLocation(), stratifiedContexts);
} else {
ReadBackedPileup rawPileup = rawContext.getBasePileup();
// filter the context based on min base and mapping qualities
ReadBackedPileup pileup = rawPileup.getBaseAndMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE, UAC.MIN_MAPPING_QUALTY_SCORE);
// filter the context based on bad mates and mismatch rate
pileup = pileup.getFilteredPileup(badReadPileupFilter);
// don't call when there is no coverage
if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE )
return null;
// are there too many deletions in the pileup?
if ( UAC.genotypeModel != GenotypeCalculationModel.Model.DINDEL &&
isValidDeletionFraction(UAC.MAX_DELETION_FRACTION) &&
(double)pileup.getNumberOfDeletions() / (double)pileup.size() > UAC.MAX_DELETION_FRACTION )
return null;
// stratify the AlignmentContext and cut by sample
Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
if ( stratifiedContexts == null )
return null;
DiploidGenotypePriors priors = new DiploidGenotypePriors(ref, UAC.heterozygosity, DiploidGenotypePriors.PROB_OF_REFERENCE_ERROR);
call = gcm.get().callLocus(tracker, ref, rawContext.getLocation(), stratifiedContexts, priors);
// annotate the call, if possible
if ( call != null && call.vc != null && annotationEngine != null ) {
// first off, we want to use the *unfiltered* context for the annotations
stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE);
Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, call.vc);
call.vc = variantContexts.iterator().next(); //We know the collection will always have exactly 1 element.
}
// initialize the data for this thread if that hasn't been done yet
if ( glcm.get() == null ) {
glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
}
if ( call != null && call.vc != null ) {
call.setRefBase(ref);
Map<String, BiallelicGenotypeLikelihoods> GLs = new HashMap<String, BiallelicGenotypeLikelihoods>();
Allele refAllele = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, type, genotypePriors, GLs);
// if the site was downsampled, record that fact
if ( rawContext.hasPileupBeenDownsampled() ) {
Map<String, Object> attrs = new HashMap<String, Object>(call.vc.getAttributes());
attrs.put(VCFConstants.DOWNSAMPLED_KEY, true);
call.vc = VariantContext.modifyAttributes(call.vc, attrs);
return createVariantContextFromLikelihoods(refContext, refAllele, GLs);
}
private VariantContext createVariantContextFromLikelihoods(ReferenceContext refContext, Allele refAllele, Map<String, BiallelicGenotypeLikelihoods> GLs) {
// no-call everyone for now
List<Allele> noCall = new ArrayList<Allele>();
noCall.add(Allele.NO_CALL);
Set<Allele> alleles = new HashSet<Allele>();
alleles.add(refAllele);
boolean addedAltAllele = false;
HashMap<String, Genotype> genotypes = new HashMap<String, Genotype>();
for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) {
if ( !addedAltAllele ) {
addedAltAllele = true;
alleles.add(GL.getAlleleA());
alleles.add(GL.getAlleleB());
}
HashMap<String, Object> attributes = new HashMap<String, Object>();
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods(), VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth());
attributes.put(likelihoods.getKey(), likelihoods.getAsString());
genotypes.put(GL.getSample(), new Genotype(GL.getSample(), noCall, Genotype.NO_NEG_LOG_10PERROR, null, attributes, false));
}
GenomeLoc loc = refContext.getLocus();
int endLoc = calculateEndPos(alleles, refAllele, loc);
return new VariantContext("UG_call",
loc.getContig(),
loc.getStart(),
endLoc,
alleles,
genotypes,
VariantContext.NO_NEG_LOG_10PERROR,
null,
null);
}
private static VariantContext GLsToPLs(VariantContext vc) {
if ( vc == null )
return null;
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
for ( Map.Entry<String, Genotype> genotype : vc.getGenotypes().entrySet() ) {
HashMap<String, Object> attributes = new HashMap<String, Object>(genotype.getValue().getAttributes());
attributes.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.GLsToPLs(genotype.getValue().getLikelihoods().getAsVector()));
calls.put(genotype.getKey(), Genotype.modifyAttributes(genotype.getValue(), attributes));
}
return VariantContext.modifyGenotypes(vc, calls);
}
/**
* Compute genotypes at a given locus.
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @param vc the GL-annotated variant context
* @return the VariantCallContext object
*/
public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) {
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc);
}
private VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
// initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) {
log10AlleleFrequencyPosteriors.set(new double[N+1]);
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
}
// estimate our confidence in a reference call and return
if ( vc.getNSamples() == 0 )
return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), false, 1.0);
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, vc.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
// find the most likely frequency
int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get());
// calculate p(f>0)
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get());
double sum = 0.0;
for (int i = 1; i <= N; i++)
sum += normalizedPosteriors[i];
double PofF = Math.min(sum, 1.0); // deal with precision errors
double phredScaledConfidence;
if ( bestAFguess != 0 ) {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
if ( Double.isInfinite(phredScaledConfidence) )
phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0];
} else {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
if ( Double.isInfinite(phredScaledConfidence) ) {
sum = 0.0;
for (int i = 1; i <= N; i++) {
if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
break;
sum += log10AlleleFrequencyPosteriors.get()[i];
}
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
}
}
// did we trigger on the provided track?
boolean atTriggerTrack = tracker.getReferenceMetaData(TRIGGER_TRACK_NAME, false).size() > 0;
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
if ( !UAC.ALL_BASES_MODE && !passesEmitThreshold(phredScaledConfidence, bestAFguess, atTriggerTrack) ) {
// technically, at this point our confidence in a reference call isn't accurately estimated
// because it didn't take into account samples with no data, so let's get a better estimate
return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), true, 1.0 - PofF);
}
// create the genotypes
Map<String, Genotype> genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
// print out stats if we have a writer
if ( verboseWriter != null )
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors);
// *** note that calculating strand bias involves overwriting data structures, so we do that last
HashMap<String, Object> attributes = new HashMap<String, Object>();
DbSNPFeature dbsnp = getDbSNP(tracker);
if ( dbsnp != null )
attributes.put(VariantContext.ID_KEY, dbsnp.getRsID());
// if the site was downsampled, record that fact
if ( rawContext.hasPileupBeenDownsampled() )
attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);
if ( !UAC.NO_SLOD && bestAFguess != 0 ) {
final boolean DEBUG_SLOD = false;
// the overall lod
//double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
// the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD);
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, vcForward.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE);
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, vcReverse.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF;
if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
// strand score is max bias between forward and reverse strands
double strandScore = Math.max(forwardLod, reverseLod);
// rescale by a factor of 10
strandScore *= 10.0;
//logger.debug(String.format("SLOD=%f", strandScore));
attributes.put("SB", Double.valueOf(strandScore));
}
GenomeLoc loc = refContext.getLocus();
int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc);
VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc,
vc.getAlleles(), genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes);
if ( annotationEngine != null ) {
// first off, we want to use the *unfiltered* context for the annotations
ReadBackedPileup pileup = null;
if (rawContext.hasExtendedEventPileup())
pileup = rawContext.getExtendedEventPileup();
else if (rawContext.hasBasePileup())
pileup = rawContext.getBasePileup();
stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall);
vcCall = variantContexts.iterator().next(); // we know the collection will always have exactly 1 element.
}
VariantCallContext call = new VariantCallContext(vcCall, passesCallThreshold(phredScaledConfidence, atTriggerTrack));
call.setRefBase(refContext.getBase());
return call;
}
private int calculateEndPos(Set<Allele> alleles, Allele refAllele, GenomeLoc loc) {
// TODO - temp fix until we can deal with extended events properly
// for indels, stop location is one more than ref allele length
boolean isSNP = true;
for (Allele a : alleles){
if (a.getBaseString().length() != 1) {
isSNP = false;
break;
}
}
int endLoc = loc.getStart();
if ( !isSNP )
endLoc += refAllele.length();
return endLoc;
}
private static boolean isValidDeletionFraction(double d) {
return ( d >= 0.0 && d <= 1.0 );
}
private Map<String, StratifiedAlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext) {
BadBaseFilter badReadPileupFilter = new BadBaseFilter(refContext, UAC);
Map<String, StratifiedAlignmentContext> stratifiedContexts = null;
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL && rawContext.hasExtendedEventPileup() ) {
ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup();
// filter the context based on min mapping quality
ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE);
// don't call when there is no coverage
if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE )
return null;
// stratify the AlignmentContext and cut by sample
stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
} else if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP && !rawContext.hasExtendedEventPileup() ) {
byte ref = refContext.getBase();
if ( !BaseUtils.isRegularBase(ref) )
return null;
// stratify the AlignmentContext and cut by sample
stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE);
// filter the reads (and test for bad pileups)
if ( !filterPileup(stratifiedContexts, badReadPileupFilter) )
return null;
}
return stratifiedContexts;
}
protected static void clearAFarray(double[] AFs) {
for ( int i = 0; i < AFs.length; i++ )
AFs[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
}
private VariantCallContext estimateReferenceConfidence(Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) {
double P_of_ref = initialPofRef;
// for each sample that we haven't examined yet
for ( String sample : samples ) {
boolean isCovered = contexts.containsKey(sample);
if ( ignoreCoveredSamples && isCovered )
continue;
int depth = 0;
if (isCovered) {
AlignmentContext context = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
if (context.hasBasePileup())
depth = context.getBasePileup().size();
else if (context.hasExtendedEventPileup())
depth = context.getExtendedEventPileup().size();
}
P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
}
return new VariantCallContext(QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING);
}
protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors) {
Allele refAllele = null, altAllele = null;
for ( Allele allele : vc.getAlleles() ) {
if ( allele.isReference() )
refAllele = allele;
else
altAllele = allele;
}
for (int i = 0; i <= N; i++) {
StringBuilder AFline = new StringBuilder("AFINFO\t");
AFline.append(pos);
AFline.append("\t");
AFline.append(refAllele);
AFline.append("\t");
if ( altAllele != null )
AFline.append(altAllele);
else
AFline.append("N/A");
AFline.append("\t");
AFline.append(i + "/" + N + "\t");
AFline.append(String.format("%.2f\t", ((float)i)/N));
AFline.append(String.format("%.8f\t", log10AlleleFrequencyPriors[i]));
if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED)
AFline.append("0.00000000\t");
else
AFline.append(String.format("%.8f\t", log10AlleleFrequencyPosteriors.get()[i]));
AFline.append(String.format("%.8f\t", normalizedPosteriors[i]));
verboseWriter.println(AFline.toString());
}
verboseWriter.println("P(f>0) = " + PofF);
verboseWriter.println("Qscore = " + phredScaledConfidence);
verboseWriter.println();
}
/**
 * Decides whether the pileup at the current locus is usable for calling, and as a side
 * effect marks the "good" bases on each GATKSAMRecord via the supplied filter.
 *
 * @param stratifiedContexts per-sample alignment contexts for this locus
 * @param badBaseFilter      filter used to flag low-quality bases on each read
 * @return true if the locus should be processed, false if the pileup is unusable
 * @throws ReviewedStingException if a non-GATKSAMRecord read is encountered
 */
private boolean filterPileup(Map<String, StratifiedAlignmentContext> stratifiedContexts, BadBaseFilter badBaseFilter) {
    int deletionCount = 0;
    int goodBaseCount = 0;

    for ( StratifiedAlignmentContext sampleContext : stratifiedContexts.values() ) {
        ReadBackedPileup basePileup = sampleContext.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup();
        for ( PileupElement element : basePileup ) {
            final SAMRecord read = element.getRead();

            if ( element.isDeletion() ) {
                // deletions count toward the deletion fraction only when the read itself is usable
                final boolean usableRead = read.getMappingQuality() >= UAC.MIN_MAPPING_QUALTY_SCORE &&
                        (UAC.USE_BADLY_MATED_READS || !BadMateFilter.hasBadMate(read));
                if ( usableRead )
                    deletionCount++;
                continue;
            }

            if ( !(read instanceof GATKSAMRecord) )
                throw new ReviewedStingException("The UnifiedGenotyper currently expects GATKSAMRecords, but instead saw a " + read.getClass());
            final GATKSAMRecord gatkRead = (GATKSAMRecord)read;
            // side effect: records which bases pass the quality filter for downstream use
            gatkRead.setGoodBases(badBaseFilter, true);
            if ( gatkRead.isGoodBase(element.getOffset()) )
                goodBaseCount++;
        }
    }

    // in all_bases mode every locus passes regardless of the counts
    if ( UAC.ALL_BASES_MODE )
        return true;

    // no usable coverage at all
    if ( goodBaseCount == 0 )
        return false;

    // reject the locus when the fraction of (usable) deletion-bearing reads is excessive
    final boolean tooManyDeletions = isValidDeletionFraction(UAC.MAX_DELETION_FRACTION) &&
            (double)deletionCount / (double)(goodBaseCount + deletionCount) > UAC.MAX_DELETION_FRACTION;
    return !tooManyDeletions;
}
/**
* Filters low quality reads out of the pileup.
* Filters low quality bases out of the SAMRecords.
*/
private class BadReadPileupFilter implements PileupElementFilter {
private class BadBaseFilter implements GATKSAMRecordFilter {
private ReferenceContext refContext;
private final UnifiedArgumentCollection UAC;
public BadReadPileupFilter(ReferenceContext refContext) { this.refContext = refContext; }
public BadBaseFilter(ReferenceContext refContext, UnifiedArgumentCollection UAC) {
this.refContext = refContext;
this.UAC = UAC;
}
public boolean allow(PileupElement pileupElement) {
return ((UAC.USE_BADLY_MATED_READS || !BadMateFilter.hasBadMate(pileupElement.getRead())) &&
AlignmentUtils.mismatchesInRefWindow(pileupElement, refContext, true) <= UAC.MAX_MISMATCHES );
public BitSet getGoodBases(final GATKSAMRecord record) {
// all bits are set to false by default
BitSet bitset = new BitSet(record.getReadLength());
// if the mapping quality is too low or the mate is bad, we can just zero out the whole read and continue
if ( record.getMappingQuality() < UAC.MIN_MAPPING_QUALTY_SCORE ||
(!UAC.USE_BADLY_MATED_READS && BadMateFilter.hasBadMate(record)) ) {
return bitset;
}
byte[] quals = record.getBaseQualities();
for (int i = 0; i < quals.length; i++) {
if ( quals[i] >= UAC.MIN_BASE_QUALTY_SCORE )
bitset.set(i);
}
// if a read is too long for the reference context, extend the context (being sure not to extend past the end of the chromosome)
if ( record.getAlignmentEnd() > refContext.getWindow().getStop() ) {
GenomeLoc window = refContext.getGenomeLocParser().createGenomeLoc(refContext.getLocus().getContig(), refContext.getWindow().getStart(), Math.min(record.getAlignmentEnd(), referenceReader.getSequenceDictionary().getSequence(refContext.getLocus().getContig()).getSequenceLength()));
byte[] bases = referenceReader.getSubsequenceAt(window.getContig(), window.getStart(), window.getStop()).getBases();
StringUtil.toUpperCase(bases);
refContext = new ReferenceContext(refContext.getGenomeLocParser(),refContext.getLocus(), window, bases);
}
BitSet mismatches = AlignmentUtils.mismatchesInRefWindow(record, refContext, UAC.MAX_MISMATCHES, MISMATCH_WINDOW_SIZE);
if ( mismatches != null )
bitset.and(mismatches);
return bitset;
}
}
/**
 * Fetches the dbSNP record bound at the current locus, if any.
 *
 * @param tracker rod data
 *
 * @return the dbsnp rod if there is one at this position, otherwise null
 *         (presumably the helper skips non-SNP records -- see DbSNPHelper.getFirstRealSNP)
 */
protected static DbSNPFeature getDbSNP(RefMetaDataTracker tracker) {
    return DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
}
/**
 * Determines whether a call is confident enough to be emitted to the output.
 * At trigger-track sites the trigger thresholds apply; elsewhere the standard
 * thresholds apply and, outside genotype mode, a non-zero alternate allele
 * count is also required.
 *
 * @param conf           phred-scaled confidence of the call
 * @param bestAFguess    most likely alternate allele count
 * @param atTriggerTrack whether the locus falls on a trigger track
 * @return true if the call should be emitted
 */
protected boolean passesEmitThreshold(double conf, int bestAFguess, boolean atTriggerTrack) {
    if ( atTriggerTrack )
        return conf >= Math.min(UAC.TRIGGER_CONFIDENCE_FOR_CALLING, UAC.TRIGGER_CONFIDENCE_FOR_EMITTING);

    final boolean emittableSite = UAC.GENOTYPE_MODE || bestAFguess != 0;
    return emittableSite && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
}
/**
 * Determines whether a call is confident enough to be marked as called
 * (as opposed to merely emitted).
 *
 * @param conf           phred-scaled confidence of the call
 * @param atTriggerTrack whether the locus falls on a trigger track
 * @return true if the confidence meets the applicable calling threshold
 */
protected boolean passesCallThreshold(double conf, boolean atTriggerTrack) {
    final double threshold = atTriggerTrack
            ? UAC.TRIGGER_CONFIDENCE_FOR_CALLING
            : UAC.STANDARD_CONFIDENCE_FOR_CALLING;
    return conf >= threshold;
}
/**
 * Populates log10AlleleFrequencyPriors with the standard neutral-model prior:
 * P(AF = i) is proportional to heterozygosity / i for i in [1, N], and the
 * AF = 0 entry receives all of the remaining probability mass.
 *
 * @param N number of chromosomes (maximum alternate allele count)
 */
protected void computeAlleleFrequencyPriors(int N) {
    // accumulate the prior mass assigned to the non-reference frequencies
    double massOnVariants = 0.0;
    for (int alleleCount = 1; alleleCount <= N; alleleCount++) {
        final double prior = UAC.heterozygosity / (double)alleleCount;
        log10AlleleFrequencyPriors[alleleCount] = Math.log10(prior);
        massOnVariants += prior;
    }

    // AF=0 gets whatever probability is left over
    log10AlleleFrequencyPriors[0] = Math.log10(1.0 - massOnVariants);
}
/**
 * Factory for the genotype priors matching the configured likelihoods model.
 *
 * @param UAC the argument collection selecting the model
 * @return priors appropriate for the selected model
 * @throws IllegalArgumentException if the model is unrecognized
 */
private static GenotypePriors createGenotypePriors(UnifiedArgumentCollection UAC) {
    switch ( UAC.GLmodel ) {
        case SNP:
            // use flat priors for GLs
            return new DiploidSNPGenotypePriors();
        case DINDEL:
            // create flat priors for Indels, actual priors will depend on event length to be genotyped
            return new DiploidIndelGenotypePriors();
        default:
            throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
    }
}
/**
 * Factory for the genotype-likelihoods calculation model selected in the arguments.
 *
 * @param logger logger handed to the model
 * @param UAC    the argument collection selecting the model
 * @return the configured likelihoods calculation model
 * @throws IllegalArgumentException if the model is unrecognized
 */
private static GenotypeLikelihoodsCalculationModel getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) {
    switch ( UAC.GLmodel ) {
        case SNP:
            return new SNPGenotypeLikelihoodsCalculationModel(UAC, logger);
        case DINDEL:
            return new DindelGenotypeLikelihoodsCalculationModel(UAC, logger);
        default:
            throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
    }
}
/**
 * Factory for the allele-frequency calculation model selected in the arguments.
 *
 * @param N             number of chromosomes
 * @param logger        logger handed to the model
 * @param verboseWriter stream for verbose model output
 * @param UAC           the argument collection selecting the model
 * @return the configured allele-frequency calculation model
 * @throws IllegalArgumentException if the model is unrecognized
 */
private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
    AlleleFrequencyCalculationModel afcm;
    switch ( UAC.AFmodel ) {
        case EXACT:
            afcm = new ExactAFCalculationModel(N, logger, verboseWriter);
            break;
        case GRID_SEARCH:
            afcm = new GridSearchAFEstimation(N, logger, verboseWriter);
            break;
        // bug fix: the message previously reported UAC.GLmodel ("GenotypeCalculationModel"),
        // copy-pasted from the GL factory, even though this switch is on UAC.AFmodel
        default: throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
    }
    return afcm;
}
}

View File

@ -1,3 +1,28 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broad.tribble.util.variantcontext.VariantContext;

View File

@ -389,7 +389,7 @@ public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker<Set<Bas
public boolean baseIsConfidentRef( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
if ( !BaseUtils.isRegularBase(ref.getBase()) )
return false;
VariantCallContext calls = ug.runGenotyper(tracker,ref,context);
VariantCallContext calls = ug.calculateLikelihoodsAndGenotypes(tracker,ref,context);
if ( calls == null || calls.vc == null)
return false;
return ( calls.vc.getNSamples() > 0 && calls.vc.getGenotype(0).isHomRef() && calls.vc.getGenotype(0).getNegLog10PError() > confidentRefThreshold );

View File

@ -559,7 +559,7 @@ public class MendelianViolationClassifier extends LocusWalker<MendelianViolation
int conf = 0;
Allele alt = null;
for ( Map.Entry<String,StratifiedAlignmentContext> sEntry : strat.entrySet() ) {
VariantCallContext call = engine.runGenotyper(tracker, ref, sEntry.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE));
VariantCallContext call = engine.calculateLikelihoodsAndGenotypes(tracker, ref, sEntry.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE));
if ( call != null && call.confidentlyCalled && call.vc != null ) {
if ( call.vc.isSNP() ) {
if ( ! call.vc.getAlternateAllele(0).basesMatch(var.getAlternateAllele(0))) {

View File

@ -38,7 +38,6 @@ import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.io.PrintStream;
@ -115,7 +114,7 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker<Integer, Pair<Lo
//}
//Calculate posterior probabilities
GenotypeLikelihoods G = new GenotypeLikelihoods(BaseMismatchModel.THREE_STATE);
DiploidSNPGenotypeLikelihoods G = new DiploidSNPGenotypeLikelihoods(BaseMismatchModel.THREE_STATE);
SAMRecord read; int offset; char base; byte qual; int mapquality; String readname;
//Check for bad bases and ensure mapping quality

View File

@ -353,7 +353,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
}
//Calculate posterior probabilities
GenotypeLikelihoods G = new GenotypeLikelihoods(BaseMismatchModel.THREE_STATE);
DiploidSNPGenotypeLikelihoods G = new DiploidSNPGenotypeLikelihoods(BaseMismatchModel.THREE_STATE);
SAMRecord read; int offset; char base; byte qual; int mapquality; String readname;
//Check for bad bases and ensure mapping quality myself. This works.

View File

@ -200,7 +200,7 @@ public class LocusMismatchWalker extends LocusWalker<String,Integer> implements
}
private Genotype getGenotype( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
VariantCallContext calls = ug.runGenotyper(tracker,ref,context);
VariantCallContext calls = ug.calculateLikelihoodsAndGenotypes(tracker,ref,context);
if ( calls == null || calls.vc == null || calls.vc.getNSamples() == 0 || !calls.vc.isSNP() )
return null;
else {

View File

@ -118,7 +118,7 @@ public class SnpCallRateByCoverageWalker extends LocusWalker<List<String>, Strin
AlignmentContext subContext = new AlignmentContext(context.getLocation(), new ReadBackedPileupImpl(context.getLocation(),sub_reads, sub_offsets));
VariantCallContext calls = UG.runGenotyper(tracker, ref, subContext);
VariantCallContext calls = UG.calculateLikelihoodsAndGenotypes(tracker, ref, subContext);
if (calls != null && calls.vc != null && calls.vc.getNSamples() > 0 && calls.confidentlyCalled) {
Genotype evCall = calls.vc.getGenotype(0);

View File

@ -177,7 +177,7 @@ public class ValidationGenotyper extends LocusWalker<ValidationGenotyper.Counted
}
}
else if( evalStatus == VARIANT_STATUS.CALLED && compStatus == VARIANT_STATUS.MISSING ) {
VariantCallContext call = engine.runGenotyper(tracker, ref, context);
VariantCallContext call = engine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
if( call != null && call.confidentlyCalled && call.vc != null && call.vc.getType() == VariantContext.Type.NO_VARIATION ) {
counter.numFP = 1L;
if( printStream!= null ) {

View File

@ -107,7 +107,7 @@ public class FindContaminatingReadGroupsWalker extends LocusWalker<Integer, Inte
double altBalance = ((double) altCount)/((double) totalCount);
if (altBalance > 0.70) {
VariantCallContext ugResult = ug.runGenotyper(tracker, ref, context);
VariantCallContext ugResult = ug.calculateLikelihoodsAndGenotypes(tracker, ref, context);
if (ugResult != null && ugResult.vc != null && ugResult.vc.getNSamples() > 0) {
return ugResult.vc.getGenotype(0).isHet();

View File

@ -1,32 +0,0 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
/**
 * Selects how the probability of the true base, given a miscalled base, is modeled
 * when computing four-base likelihoods.
 */
public enum BaseMismatchModel {
    ONE_STATE,   // the miscall absorbs all of the error probability e
    THREE_STATE, // the error probability e is split evenly (e/3) across the three other bases
    EMPIRICAL    // platform-specific, empirically measured substitution probabilities
}

View File

@ -1,301 +0,0 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.BaseUtils;
import static java.lang.Math.log10;
import java.util.TreeMap;
import java.util.EnumMap;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMReadGroupRecord;
/**
 * Four-base likelihoods model backed by empirically measured, sequencer-specific base
 * substitution probabilities: given that a base was miscalled, each platform has its own
 * distribution over which of the other three bases is actually the true one.
 */
public class EmpiricalSubstitutionProbabilities extends FourBaseLikelihoods {
    // --------------------------------------------------------------------------------------------------------------
    //
    // Static methods to manipulate machine platforms
    //
    // --------------------------------------------------------------------------------------------------------------

    /** Sequencing technologies with a dedicated substitution table. */
    public enum SequencerPlatform {
        SOLEXA,         // Solexa / Illumina
        ROCHE454,       // 454
        SOLID,          // SOLiD
        CG,             // Complete Genomics
        UNKNOWN         // No idea -- defaulting to 1/3
    }

    // maps @RG PL header strings (registered as-is, uppercased, and lowercased) to a platform
    private static TreeMap<String, SequencerPlatform> PLFieldToSequencerPlatform = new TreeMap<String, SequencerPlatform>();

    // registers s, s.toUpperCase(), and s.toLowerCase() as aliases for platform x
    private static void bind(String s, SequencerPlatform x) {
        PLFieldToSequencerPlatform.put(s, x);
        PLFieldToSequencerPlatform.put(s.toUpperCase(), x);
        PLFieldToSequencerPlatform.put(s.toLowerCase(), x);
    }

    //
    // Static list of platforms supported by this system.
    //
    static {
        bind("LS454", SequencerPlatform.ROCHE454);
        bind("454", SequencerPlatform.ROCHE454);
        bind("ILLUMINA", SequencerPlatform.SOLEXA);
        bind("SOLEXA", SequencerPlatform.SOLEXA);
        bind("SOLID", SequencerPlatform.SOLID);
        bind("ABI_SOLID", SequencerPlatform.SOLID);
        bind("CG", SequencerPlatform.CG);
    }

    /**
     * Maps a PL header string to a supported platform, returning UNKNOWN for null or
     * unrecognized strings.  Lookup is effectively case-insensitive because bind()
     * registered the lowercase form of every alias.
     *
     * @param sequencerString the PL field value, possibly null
     * @return the corresponding platform, or UNKNOWN
     */
    public static SequencerPlatform standardizeSequencerPlatform( final String sequencerString ) {
        String lcSequencerString = sequencerString == null ? null : sequencerString.toLowerCase();
        if ( sequencerString != null && PLFieldToSequencerPlatform.containsKey(lcSequencerString) ) {
            return PLFieldToSequencerPlatform.get(lcSequencerString);
        } else {
            return SequencerPlatform.UNKNOWN;
        }
    }

    // one-entry, per-thread memoization of the last read's platform, keyed by read identity
    private static ThreadLocal<SAMRecord> lastReadForPL = new ThreadLocal<SAMRecord>();
    private static ThreadLocal<SequencerPlatform> plOfLastRead = new ThreadLocal<SequencerPlatform>();

    /**
     * Determines the platform of a read from its read group's PL field, caching the
     * result for consecutive calls with the same read (identity comparison).
     *
     * @param read the read to classify
     * @return the read's platform, or UNKNOWN if none could be determined
     */
    public static SequencerPlatform getReadSequencerPlatform( SAMRecord read ) {
        if ( lastReadForPL.get() != read ) {
            lastReadForPL.set(read);
            SAMReadGroupRecord readGroup = read.getReadGroup();
            final String platformName = readGroup == null ? null : readGroup.getPlatform();
            plOfLastRead.set(standardizeSequencerPlatform(platformName));
        }

        return plOfLastRead.get();
    }

    /** @return the ordinal of the read's platform within SequencerPlatform */
    public int getReadSequencerPlatformIndex( SAMRecord read ) {
        return getReadSequencerPlatform(read).ordinal();
    }

    // --------------------------------------------------------------------------------------------------------------
    //
    // Static methods to get at the transition tables themselves
    //
    // --------------------------------------------------------------------------------------------------------------

    /**
     * A matrix of value i x j -> log10(p) where
     *
     *  i - byte of the miscalled base (i.e., A)
     *  j - byte of the presumed true base (i.e., C)
     *  log10p - empirical probability p that A is actually C
     *
     * The table is available for each technology
     */
    private final static EnumMap<SequencerPlatform, double[][]> log10pTrueGivenMiscall = new EnumMap<SequencerPlatform, double[][]>(SequencerPlatform.class);

    // records log10(p) for "miscalledBase is really trueBase" in platform pl's table,
    // lazily allocating the 4x4 matrix on first use
    private static void addMisCall(final SequencerPlatform pl, byte miscalledBase, byte trueBase, double p) {
        if ( ! log10pTrueGivenMiscall.containsKey(pl) )
            log10pTrueGivenMiscall.put(pl, new double[4][4]);

        double[][] misCallProbs = log10pTrueGivenMiscall.get(pl);
        int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase);
        int j = BaseUtils.simpleBaseToBaseIndex(trueBase);
        misCallProbs[i][j] = log10(p);
    }

    // looks up log10(p) for the (miscalledBase, trueBase) pair; despite the name this
    // returns the log10 probability, not the raw probability.  A 0.0 entry means the
    // pair was never registered (diagonal or missing data) and is treated as an error.
    private static double getProbMiscallIsBase(SequencerPlatform pl, byte miscalledBase, byte trueBase) {
        int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase);
        int j = BaseUtils.simpleBaseToBaseIndex(trueBase);

        double logP = log10pTrueGivenMiscall.get(pl)[i][j];
        if ( logP == 0.0 )
            throw new RuntimeException(String.format("Bad miscall base request miscalled=%c true=%c", miscalledBase, trueBase));
        else
            return logP;
    }

    // empirical substitution table for Solexa / Illumina (percent -> probability)
    private static void addSolexa() {
        SequencerPlatform pl = SequencerPlatform.SOLEXA;
        addMisCall(pl, BaseUtils.A, BaseUtils.C, 57.7/100.0);
        addMisCall(pl, BaseUtils.A, BaseUtils.G, 17.1/100.0);
        addMisCall(pl, BaseUtils.A, BaseUtils.T, 25.2/100.0);

        addMisCall(pl, BaseUtils.C, BaseUtils.A, 34.9/100.0);
        addMisCall(pl, BaseUtils.C, BaseUtils.G, 11.3/100.0);
        addMisCall(pl, BaseUtils.C, BaseUtils.T, 53.9/100.0);

        addMisCall(pl, BaseUtils.G, BaseUtils.A, 31.9/100.0);
        addMisCall(pl, BaseUtils.G, BaseUtils.C,  5.1/100.0);
        addMisCall(pl, BaseUtils.G, BaseUtils.T, 63.0/100.0);

        addMisCall(pl, BaseUtils.T, BaseUtils.A, 45.8/100.0);
        addMisCall(pl, BaseUtils.T, BaseUtils.C, 22.1/100.0);
        addMisCall(pl, BaseUtils.T, BaseUtils.G, 32.0/100.0);
    }

    // empirical substitution table for SOLiD
    private static void addSOLiD() {
        SequencerPlatform pl = SequencerPlatform.SOLID;
        addMisCall(pl, BaseUtils.A, BaseUtils.C, 18.7/100.0);
        addMisCall(pl, BaseUtils.A, BaseUtils.G, 42.5/100.0);
        addMisCall(pl, BaseUtils.A, BaseUtils.T, 38.7/100.0);

        addMisCall(pl, BaseUtils.C, BaseUtils.A, 27.0/100.0);
        addMisCall(pl, BaseUtils.C, BaseUtils.G, 18.9/100.0);
        addMisCall(pl, BaseUtils.C, BaseUtils.T, 54.1/100.0);

        addMisCall(pl, BaseUtils.G, BaseUtils.A, 61.0/100.0);
        addMisCall(pl, BaseUtils.G, BaseUtils.C, 15.7/100.0);
        addMisCall(pl, BaseUtils.G, BaseUtils.T, 23.2/100.0);

        addMisCall(pl, BaseUtils.T, BaseUtils.A, 40.5/100.0);
        addMisCall(pl, BaseUtils.T, BaseUtils.C, 34.3/100.0);
        addMisCall(pl, BaseUtils.T, BaseUtils.G, 25.2/100.0);
    }

    // empirical substitution table for Roche 454
    private static void add454() {
        SequencerPlatform pl = SequencerPlatform.ROCHE454;
        addMisCall(pl, BaseUtils.A, BaseUtils.C, 23.2/100.0);
        addMisCall(pl, BaseUtils.A, BaseUtils.G, 42.6/100.0);
        addMisCall(pl, BaseUtils.A, BaseUtils.T, 34.3/100.0);

        addMisCall(pl, BaseUtils.C, BaseUtils.A, 19.7/100.0);
        addMisCall(pl, BaseUtils.C, BaseUtils.G,  8.4/100.0);
        addMisCall(pl, BaseUtils.C, BaseUtils.T, 71.9/100.0);

        addMisCall(pl, BaseUtils.G, BaseUtils.A, 71.5/100.0);
        addMisCall(pl, BaseUtils.G, BaseUtils.C,  6.6/100.0);
        addMisCall(pl, BaseUtils.G, BaseUtils.T, 21.9/100.0);

        addMisCall(pl, BaseUtils.T, BaseUtils.A, 43.8/100.0);
        addMisCall(pl, BaseUtils.T, BaseUtils.C, 37.8/100.0);
        addMisCall(pl, BaseUtils.T, BaseUtils.G, 18.5/100.0);
    }

    // empirical substitution table for Complete Genomics
    private static void addCG() {
        SequencerPlatform pl = SequencerPlatform.CG;
        addMisCall(pl, BaseUtils.A, BaseUtils.C, 28.2/100.0);
        addMisCall(pl, BaseUtils.A, BaseUtils.G, 28.7/100.0);
        addMisCall(pl, BaseUtils.A, BaseUtils.T, 43.1/100.0);

        addMisCall(pl, BaseUtils.C, BaseUtils.A, 29.8/100.0);
        addMisCall(pl, BaseUtils.C, BaseUtils.G, 18.6/100.0);
        addMisCall(pl, BaseUtils.C, BaseUtils.T, 51.6/100.0);

        addMisCall(pl, BaseUtils.G, BaseUtils.A, 32.5/100.0);
        addMisCall(pl, BaseUtils.G, BaseUtils.C, 21.4/100.0);
        addMisCall(pl, BaseUtils.G, BaseUtils.T, 46.1/100.0);

        addMisCall(pl, BaseUtils.T, BaseUtils.A, 42.6/100.0);
        addMisCall(pl, BaseUtils.T, BaseUtils.C, 34.1/100.0);
        addMisCall(pl, BaseUtils.T, BaseUtils.G, 23.3/100.0);
    }

    // uniform 1/3 fallback table for unknown platforms
    private static void addUnknown() {
        SequencerPlatform pl = SequencerPlatform.UNKNOWN;
        for ( byte b1 : BaseUtils.BASES ) {
            for ( byte b2 : BaseUtils.BASES ) {
                if ( b1 != b2 )
                    addMisCall(pl, b1, b2, 1.0/3.0);
            }
        }
    }

    // populate every platform table once at class-load time
    static {
        addSolexa();
        add454();
        addSOLiD();
        addCG();
        addUnknown();
    }

    // --------------------------------------------------------------------------------------------------------------
    //
    // The actual objects themselves
    //
    // --------------------------------------------------------------------------------------------------------------

    // if true, reads with an UNKNOWN platform cause a RuntimeException; otherwise they
    // fall back to defaultPlatform
    private boolean raiseErrorOnUnknownPlatform = true;
    private SequencerPlatform defaultPlatform = SequencerPlatform.UNKNOWN;

    //
    // forwarding constructors -- don't do anything at all
    //
    public EmpiricalSubstitutionProbabilities() { super(); }

    /** @param raiseErrorOnUnknownPlatform whether an UNKNOWN platform is a fatal error */
    public EmpiricalSubstitutionProbabilities(boolean raiseErrorOnUnknownPlatform) {
        super();
        this.raiseErrorOnUnknownPlatform = raiseErrorOnUnknownPlatform;
    }

    /** @param assumeUnknownPlatformsAreThis if non-null, treat UNKNOWN-platform reads as this platform */
    public EmpiricalSubstitutionProbabilities(SequencerPlatform assumeUnknownPlatformsAreThis) {
        super();
        if ( assumeUnknownPlatformsAreThis != null ) {
            raiseErrorOnUnknownPlatform = false;
            defaultPlatform = assumeUnknownPlatformsAreThis;
        }
    }

    /**
     * Cloning of the object
     * @return clone
     * @throws CloneNotSupportedException
     */
    protected Object clone() throws CloneNotSupportedException {
        return super.clone();
    }

    // -----------------------------------------------------------------------------------------------------------------
    //
    //
    // calculation of p(B|GT)
    //
    //
    // -----------------------------------------------------------------------------------------------------------------

    /**
     * Returns the platform-specific log10 probability that the true base is chromBase
     * given that observedBase was (mis)called.  Reverse-strand reads look up the
     * complemented base pair.
     *
     * @param observedBase the called base
     * @param chromBase    the presumed true base
     * @param read         the read supplying strand and platform information
     * @param offset       offset of the base on the read (unused here)
     * @return log10 probability of the substitution
     */
    protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
        boolean fwdStrand = ! read.getReadNegativeStrandFlag();

        SequencerPlatform pl = getReadSequencerPlatform(read);
        if ( pl == SequencerPlatform.UNKNOWN ) {
            if ( raiseErrorOnUnknownPlatform )
                throw new RuntimeException("Unknown sequencer platform for read " + read.format() + "; your BAM file is either missing the PL tag for some read groups or an unsupported platform is being used.");
            else {
                pl = defaultPlatform;
                //System.out.printf("Using default platform %s", pl);
            }
        }

        //System.out.printf("%s for %s%n", pl, read);

        double log10p;
        if ( fwdStrand ) {
            log10p = getProbMiscallIsBase(pl, observedBase, chromBase);
        } else {
            log10p = getProbMiscallIsBase(pl, BaseUtils.simpleComplement(observedBase), BaseUtils.simpleComplement(chromBase));
        }

        //System.out.printf("p = %f for %s %c %c fwd=%b %d at %s%n", pow(10,log10p), pl, observedBase, chromBase, fwdStrand, offset, read.getReadName() );

        return log10p;
    }
}

View File

@ -1,60 +0,0 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import net.sf.samtools.SAMRecord;
/**
* This implements the old style CRD calculation of the chance that a base being a true chromBase given
* an miscalled base, in which the p is e, grabbing all of the probability. It shouldn't be used
*/
/**
 * This implements the old style CRD calculation of the chance that a base being a true chromBase given
 * an miscalled base, in which the p is e, grabbing all of the probability.  It shouldn't be used
 */
public class OneStateErrorProbabilities extends FourBaseLikelihoods {
    //
    // forwarding constructors -- don't do anything at all
    //
    public OneStateErrorProbabilities() { super(); }

    /**
     * Cloning of the object
     * @return clone
     * @throws CloneNotSupportedException
     */
    protected Object clone() throws CloneNotSupportedException {
        return super.clone();
    }

    /**
     * Log10 probability that the true base is chromBase given observedBase was miscalled.
     * Always log10(1) = 0: the miscall absorbs the full error probability e.
     *
     * @param observedBase observed base
     * @param chromBase    target base
     * @param read         SAM read
     * @param offset       offset on read
     * @return log10 likelihood
     */
    protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
        return 0; // equivalent to e model
    }
}

View File

@ -1,63 +0,0 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import static java.lang.Math.log10;
import net.sf.samtools.SAMRecord;
/**
 * Four-base likelihoods model in which a miscall's error probability e is split
 * uniformly across the three non-called bases (the classic e/3 model).
 */
public class ThreeStateErrorProbabilities extends FourBaseLikelihoods {
    //
    // forwarding constructors -- don't do anything at all
    //
    public ThreeStateErrorProbabilities() { super(); }

    /**
     * Cloning of the object
     * @return clone
     * @throws CloneNotSupportedException
     */
    protected Object clone() throws CloneNotSupportedException {
        return super.clone();
    }

    /**
     * Simple log10(3) cached value
     */
    protected static final double log103 = log10(3.0);

    /**
     * Log10 probability that the true base is chromBase given observedBase was miscalled;
     * constant -log10(3) for every base pair.
     *
     * @param observedBase observed base
     * @param chromBase    target base
     * @param read         SAM read
     * @param offset       offset on read
     * @return log10 likelihood
     */
    protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
        return -log103;  // equivalent to e / 3 model
    }
}

View File

@ -1,122 +0,0 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
public class UnifiedArgumentCollection {
// control the various models to be used
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while DINDEL is also available for calling indels.", required = false)
public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false)
public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ when using the SNP Genotype Likelihoods model -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL;
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY;
// control the output
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false)
public boolean GENOTYPE_MODE = false;
@Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output all callable bases?", required = false)
public boolean ALL_BASES_MODE = false;
// Phred-scaled confidence thresholds controlling when variants are called vs. merely emitted.
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false)
public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
@Argument(fullName = "trigger_min_confidence_threshold_for_calling", shortName = "trig_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be called", required = false)
public double TRIGGER_CONFIDENCE_FOR_CALLING = 30.0;
@Argument(fullName = "trigger_min_confidence_threshold_for_emitting", shortName = "trig_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
public double TRIGGER_CONFIDENCE_FOR_EMITTING = 30.0;
@Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false)
public boolean NO_SLOD = false;
// control the error modes
@Hidden
@Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
public String ASSUME_SINGLE_SAMPLE = null;
@Hidden
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
public EmpiricalSubstitutionProbabilities.SequencerPlatform defaultPlatform = null;
// control the various parameters to be used
// NOTE(review): "QUALTY" in the two field names below is a long-standing typo;
// renaming would break external code referencing these public fields.
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
public int MIN_BASE_QUALTY_SCORE = 17;
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false)
public int MIN_MAPPING_QUALTY_SCORE = 20;
@Argument(fullName = "max_mismatches_in_40bp_window", shortName = "mm40", doc = "Maximum number of mismatches within a 40 bp window (20bp on either side) around the target position for a read to be used for calling", required = false)
public int MAX_MISMATCHES = 3;
@Argument(fullName = "use_reads_with_bad_mates", shortName = "bad_mates", doc = "Use reads whose mates are mapped excessively far away for calling", required = false)
public boolean USE_BADLY_MATED_READS = false;
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
public Double MAX_DELETION_FRACTION = 0.05;
/**
 * Creates a field-by-field copy of this argument collection.
 * <p>
 * NOTE(review): this is a manual copy rather than a super.clone()-based one; any
 * field later added to this class must also be copied here or it will silently
 * revert to its default in the copy -- TODO confirm every field is covered.
 */
public UnifiedArgumentCollection clone() {
    UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
    uac.GLmodel = GLmodel;
    uac.baseModel = baseModel;
    uac.heterozygosity = heterozygosity;
    uac.GENOTYPE_MODE = GENOTYPE_MODE;
    uac.ALL_BASES_MODE = ALL_BASES_MODE;
    uac.NO_SLOD = NO_SLOD;
    uac.ASSUME_SINGLE_SAMPLE = ASSUME_SINGLE_SAMPLE;
    uac.defaultPlatform = defaultPlatform;
    uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING;
    uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING;
    uac.TRIGGER_CONFIDENCE_FOR_CALLING = TRIGGER_CONFIDENCE_FOR_CALLING;
    uac.TRIGGER_CONFIDENCE_FOR_EMITTING = TRIGGER_CONFIDENCE_FOR_EMITTING;
    uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE;
    uac.MIN_MAPPING_QUALTY_SCORE = MIN_MAPPING_QUALTY_SCORE;
    uac.MAX_MISMATCHES = MAX_MISMATCHES;
    uac.USE_BADLY_MATED_READS = USE_BADLY_MATED_READS;
    uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION;
    return uac;
}
}

View File

@ -1,681 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.Allele;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.*;
import org.broadinstitute.sting.utils.sam.GATKSAMRecordFilter;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broad.tribble.vcf.VCFConstants;
import org.broad.tribble.dbsnp.DbSNPFeature;
import java.io.PrintStream;
import java.util.*;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.StringUtil;
import net.sf.picard.reference.IndexedFastaSequenceFile;
public class UnifiedGenotyperEngine {
public static final String TRIGGER_TRACK_NAME = "trigger";
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
// the unified argument collection
private UnifiedArgumentCollection UAC = null;
// the annotation engine
private VariantAnnotatorEngine annotationEngine;
// the model used for calculating genotypes (one instance per thread)
private ThreadLocal<GenotypeLikelihoodsCalculationModel> glcm = new ThreadLocal<GenotypeLikelihoodsCalculationModel>();
// the model used for calculating p(non-ref) (one instance per thread)
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
private double[] log10AlleleFrequencyPriors;
// the allele frequency likelihoods (allocated once as an optimization)
private ThreadLocal<double[]> log10AlleleFrequencyPosteriors = new ThreadLocal<double[]>();
// the priors object
private GenotypePriors genotypePriors;
// samples in input
private Set<String> samples = new TreeSet<String>();
// the various loggers and writers
private Logger logger = null;
private PrintStream verboseWriter = null;
// fasta reference reader to supplement the edges of the reference sequence for long reads
private IndexedFastaSequenceFile referenceReader;
// number of chromosomes (2 * samples) in input
private int N;
// the standard filter to use for calls below the confidence threshold but above the emit threshold
// NOTE(review): this set is static yet populated from the instance method initialize();
// the add is idempotent, but sharing mutable static state across engine instances is fragile.
private static final Set<String> filter = new HashSet<String>(1);
private static final int MISMATCH_WINDOW_SIZE = 20;
/**
 * Creates an engine whose sample set is taken from the BAM header (or from the
 * single assumed sample if one was supplied via the argument collection).
 */
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
    // get the number of samples
    // if we're supposed to assume a single sample, do so
    samples = new TreeSet<String>();
    if ( UAC.ASSUME_SINGLE_SAMPLE != null )
        samples.add(UAC.ASSUME_SINGLE_SAMPLE);
    else
        samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader());
    // no logger/verbose writer/annotation engine in this construction path
    initialize(toolkit, UAC, null, null, null, samples.size());
}
/**
 * Creates an engine with an explicit sample set plus logging, verbose-output,
 * and annotation hooks (the construction path used by the walker).
 */
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set<String> samples) {
    this.samples = new TreeSet<String>(samples);
    initialize(toolkit, UAC, logger, verboseWriter, engine, samples.size());
}
// Shared constructor logic: caches the collaborators, precomputes the allele
// frequency priors over 0..2*numSamples chromosomes, and opens the reference
// reader used to extend the reference window for long reads.
private void initialize(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
    this.UAC = UAC;
    this.logger = logger;
    this.verboseWriter = verboseWriter;
    this.annotationEngine = engine;
    N = 2 * numSamples;
    log10AlleleFrequencyPriors = new double[N+1];
    computeAlleleFrequencyPriors(N);
    genotypePriors = createGenotypePriors(UAC);
    // idempotent add into the static 'filter' set shared by all engine instances
    filter.add(LOW_QUAL_FILTER_NAME);
    referenceReader = new IndexedFastaSequenceFile(toolkit.getArguments().referenceFile);
}
/**
* Compute full calls at a given locus.
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @return the VariantCallContext object
*/
public VariantCallContext calculateLikelihoodsAndGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
    Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
    VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
    if ( vc == null )
        return null;  // site is not callable under the current filters/model
    VariantCallContext vcc = calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc);
    // convert the per-genotype GL attributes into phred-scaled PLs for output
    vcc.vc = GLsToPLs(vcc.vc);
    return vcc;
}
/**
* Compute GLs at a given locus.
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @return the VariantContext object
*/
public VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
    // Stratify the raw pileup by sample, compute per-sample GLs over the complete
    // (both-strand) context, then convert the GL attributes to phred-scaled PLs.
    final Map<String, StratifiedAlignmentContext> contexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext);
    return GLsToPLs(calculateLikelihoods(tracker, refContext, contexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE));
}
/**
 * Computes per-sample biallelic genotype likelihoods for the given stratified
 * contexts, lazily creating this thread's GL calculation model on first use.
 *
 * @return a GL-annotated VariantContext, or null if stratifiedContexts is null
 */
private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map<String, StratifiedAlignmentContext> stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType type) {
    if ( stratifiedContexts == null )
        return null;
    // initialize the data for this thread if that hasn't been done yet
    if ( glcm.get() == null ) {
        glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
    }
    Map<String, BiallelicGenotypeLikelihoods> GLs = new HashMap<String, BiallelicGenotypeLikelihoods>();
    Allele refAllele = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, type, genotypePriors, GLs);
    return createVariantContextFromLikelihoods(refContext, refAllele, GLs);
}
/**
 * Builds a VariantContext whose genotypes are all no-calls but carry per-sample
 * depth and GL attributes; genotype assignment happens later in calculateGenotypes.
 */
private VariantContext createVariantContextFromLikelihoods(ReferenceContext refContext, Allele refAllele, Map<String, BiallelicGenotypeLikelihoods> GLs) {
    // no-call everyone for now
    List<Allele> noCall = new ArrayList<Allele>();
    noCall.add(Allele.NO_CALL);
    Set<Allele> alleles = new HashSet<Allele>();
    alleles.add(refAllele);
    boolean addedAltAllele = false;
    HashMap<String, Genotype> genotypes = new HashMap<String, Genotype>();
    for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) {
        // the two alleles of the biallelic model are identical for every sample,
        // so they only need to be recorded once
        if ( !addedAltAllele ) {
            addedAltAllele = true;
            alleles.add(GL.getAlleleA());
            alleles.add(GL.getAlleleB());
        }
        HashMap<String, Object> attributes = new HashMap<String, Object>();
        GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods(), VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
        attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth());
        attributes.put(likelihoods.getKey(), likelihoods.getAsString());
        genotypes.put(GL.getSample(), new Genotype(GL.getSample(), noCall, Genotype.NO_NEG_LOG_10PERROR, null, attributes, false));
    }
    GenomeLoc loc = refContext.getLocus();
    int endLoc = calculateEndPos(alleles, refAllele, loc);
    return new VariantContext("UG_call",
            loc.getContig(),
            loc.getStart(),
            endLoc,
            alleles,
            genotypes,
            VariantContext.NO_NEG_LOG_10PERROR,
            null,
            null);
}
/**
 * Replaces each genotype's GL attribute with the equivalent phred-scaled PL attribute.
 *
 * @param vc the context to convert (may be null)
 * @return the converted context, or null if vc was null
 */
private static VariantContext GLsToPLs(VariantContext vc) {
    if ( vc == null )
        return null;
    HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
    for ( Map.Entry<String, Genotype> genotype : vc.getGenotypes().entrySet() ) {
        // copy the attributes, then swap GL for PL
        HashMap<String, Object> attributes = new HashMap<String, Object>(genotype.getValue().getAttributes());
        attributes.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
        attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.GLsToPLs(genotype.getValue().getLikelihoods().getAsVector()));
        calls.put(genotype.getKey(), Genotype.modifyAttributes(genotype.getValue(), attributes));
    }
    return VariantContext.modifyGenotypes(vc, calls);
}
/**
* Compute genotypes at a given locus.
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @param vc the GL-annotated variant context
* @return the VariantCallContext object
*/
public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) {
    // Delegate to the private overload after stratifying the raw context by sample.
    return calculateGenotypes(tracker, refContext, rawContext,
            getFilteredAndStratifiedContexts(UAC, refContext, rawContext), vc);
}
/**
 * Core genotyping step: estimates the most likely allele frequency, decides
 * whether the site passes the emit/call thresholds, assigns genotypes, computes
 * the strand-bias (SLOD) score, and runs the annotation engine on the call.
 */
private VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
    // initialize the data for this thread if that hasn't been done yet
    if ( afcm.get() == null ) {
        log10AlleleFrequencyPosteriors.set(new double[N+1]);
        afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
    }
    // estimate our confidence in a reference call and return
    if ( vc.getNSamples() == 0 )
        return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), false, 1.0);
    // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
    clearAFarray(log10AlleleFrequencyPosteriors.get());
    afcm.get().getLog10PNonRef(tracker, refContext, vc.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
    // find the most likely frequency
    int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get());
    // calculate p(f>0)
    double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get());
    double sum = 0.0;
    for (int i = 1; i <= N; i++)
        sum += normalizedPosteriors[i];
    double PofF = Math.min(sum, 1.0); // deal with precision errors
    double phredScaledConfidence;
    if ( bestAFguess != 0 ) {
        // variant call: confidence is derived from P(AF == 0)
        phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
        if ( Double.isInfinite(phredScaledConfidence) )
            phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0];
    } else {
        // reference call: confidence is derived from P(AF > 0)
        phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
        if ( Double.isInfinite(phredScaledConfidence) ) {
            // fall back to summing the raw log10 posteriors up to the first
            // uncomputed (sentinel) entry
            sum = 0.0;
            for (int i = 1; i <= N; i++) {
                if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
                    break;
                sum += log10AlleleFrequencyPosteriors.get()[i];
            }
            phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
        }
    }
    // did we trigger on the provided track?
    boolean atTriggerTrack = tracker.getReferenceMetaData(TRIGGER_TRACK_NAME, false).size() > 0;
    // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
    if ( !UAC.ALL_BASES_MODE && !passesEmitThreshold(phredScaledConfidence, bestAFguess, atTriggerTrack) ) {
        // technically, at this point our confidence in a reference call isn't accurately estimated
        // because it didn't take into account samples with no data, so let's get a better estimate
        return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), true, 1.0 - PofF);
    }
    // create the genotypes
    Map<String, Genotype> genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess);
    // print out stats if we have a writer
    if ( verboseWriter != null )
        printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors);
    // *** note that calculating strand bias involves overwriting data structures, so we do that last
    HashMap<String, Object> attributes = new HashMap<String, Object>();
    DbSNPFeature dbsnp = getDbSNP(tracker);
    if ( dbsnp != null )
        attributes.put(VariantContext.ID_KEY, dbsnp.getRsID());
    // if the site was downsampled, record that fact
    if ( rawContext.hasPileupBeenDownsampled() )
        attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);
    if ( !UAC.NO_SLOD && bestAFguess != 0 ) {
        final boolean DEBUG_SLOD = false;
        // the overall lod
        //double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
        double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
        if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
        // the forward lod
        VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD);
        clearAFarray(log10AlleleFrequencyPosteriors.get());
        afcm.get().getLog10PNonRef(tracker, refContext, vcForward.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
        //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
        double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
        double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
        if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
        // the reverse lod
        VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE);
        clearAFarray(log10AlleleFrequencyPosteriors.get());
        afcm.get().getLog10PNonRef(tracker, refContext, vcReverse.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
        //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
        double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
        double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
        if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
        double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
        double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF;
        if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
        // strand score is max bias between forward and reverse strands
        double strandScore = Math.max(forwardLod, reverseLod);
        // rescale by a factor of 10
        strandScore *= 10.0;
        //logger.debug(String.format("SLOD=%f", strandScore));
        attributes.put("SB", Double.valueOf(strandScore));
    }
    GenomeLoc loc = refContext.getLocus();
    int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc);
    VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc,
            vc.getAlleles(), genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes);
    if ( annotationEngine != null ) {
        // first off, we want to use the *unfiltered* context for the annotations
        ReadBackedPileup pileup = null;
        if (rawContext.hasExtendedEventPileup())
            pileup = rawContext.getExtendedEventPileup();
        else if (rawContext.hasBasePileup())
            pileup = rawContext.getBasePileup();
        stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
        Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall);
        vcCall = variantContexts.iterator().next(); // we know the collection will always have exactly 1 element.
    }
    VariantCallContext call = new VariantCallContext(vcCall, passesCallThreshold(phredScaledConfidence, atTriggerTrack));
    call.setRefBase(refContext.getBase());
    return call;
}
private int calculateEndPos(Set<Allele> alleles, Allele refAllele, GenomeLoc loc) {
    // TODO - temp fix until we can deal with extended events properly
    // A site is a SNP only when every allele is a single base; for indels the
    // stop position is extended by the reference allele length.
    boolean sawMultiBaseAllele = false;
    for ( Allele a : alleles ) {
        if ( a.getBaseString().length() != 1 ) {
            sawMultiBaseAllele = true;
            break;
        }
    }
    return sawMultiBaseAllele ? loc.getStart() + refAllele.length() : loc.getStart();
}
// A fraction outside [0, 1] is the user's sentinel for "deletion filtering disabled".
private static boolean isValidDeletionFraction(double d) {
    return 0.0 <= d && d <= 1.0;
}
/**
 * Filters the raw alignment context (mapping quality, bad bases) and stratifies
 * it per sample, according to the active genotype-likelihoods model.
 *
 * @return the per-sample contexts, or null if the site is not callable
 */
private Map<String, StratifiedAlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext) {
    BadBaseFilter badReadPileupFilter = new BadBaseFilter(refContext, UAC);
    Map<String, StratifiedAlignmentContext> stratifiedContexts = null;
    if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL && rawContext.hasExtendedEventPileup() ) {
        ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup();
        // filter the context based on min mapping quality
        ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE);
        // don't call when there is no coverage
        if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE )
            return null;
        // stratify the AlignmentContext and cut by sample
        stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
    } else if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP && !rawContext.hasExtendedEventPileup() ) {
        byte ref = refContext.getBase();
        // an ambiguous reference base is never callable under the SNP model
        if ( !BaseUtils.isRegularBase(ref) )
            return null;
        // stratify the AlignmentContext and cut by sample
        stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE);
        // filter the reads (and test for bad pileups)
        if ( !filterPileup(stratifiedContexts, badReadPileupFilter) )
            return null;
    }
    return stratifiedContexts;
}
// Resets every entry to the sentinel marking "posterior not yet computed".
protected static void clearAFarray(double[] AFs) {
    Arrays.fill(AFs, AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED);
}
/**
 * Estimates the confidence that the remaining samples are homozygous reference,
 * folding in a binomial model of seeing no non-ref bases at each sample's depth.
 *
 * @param contexts             per-sample contexts at this site
 * @param theta                the heterozygosity
 * @param ignoreCoveredSamples if true, only samples without coverage contribute
 * @param initialPofRef        prior probability of the reference call
 */
private VariantCallContext estimateReferenceConfidence(Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) {
    double P_of_ref = initialPofRef;
    // for each sample that we haven't examined yet
    for ( String sample : samples ) {
        boolean isCovered = contexts.containsKey(sample);
        if ( ignoreCoveredSamples && isCovered )
            continue;
        int depth = 0;
        if (isCovered) {
            AlignmentContext context = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE);
            if (context.hasBasePileup())
                depth = context.getBasePileup().size();
            else if (context.hasExtendedEventPileup())
                depth = context.getExtendedEventPileup().size();
        }
        // P(ref) *= 1 - P(a het site would show zero alt bases in 'depth' reads)
        P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
    }
    return new VariantCallContext(QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING);
}
/**
 * Writes the per-frequency prior/posterior table plus summary statistics to the
 * verbose writer; callers must ensure verboseWriter is non-null.
 */
protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors) {
    Allele refAllele = null, altAllele = null;
    for ( Allele allele : vc.getAlleles() ) {
        if ( allele.isReference() )
            refAllele = allele;
        else
            altAllele = allele;
    }
    for (int i = 0; i <= N; i++) {
        StringBuilder AFline = new StringBuilder("AFINFO\t");
        AFline.append(pos);
        AFline.append("\t");
        AFline.append(refAllele);
        AFline.append("\t");
        if ( altAllele != null )
            AFline.append(altAllele);
        else
            AFline.append("N/A");
        AFline.append("\t");
        AFline.append(i + "/" + N + "\t");
        AFline.append(String.format("%.2f\t", ((float)i)/N));
        AFline.append(String.format("%.8f\t", log10AlleleFrequencyPriors[i]));
        // entries never computed are printed as zero rather than the sentinel value
        if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED)
            AFline.append("0.00000000\t");
        else
            AFline.append(String.format("%.8f\t", log10AlleleFrequencyPosteriors.get()[i]));
        AFline.append(String.format("%.8f\t", normalizedPosteriors[i]));
        verboseWriter.println(AFline.toString());
    }
    verboseWriter.println("P(f>0) = " + PofF);
    verboseWriter.println("Qscore = " + phredScaledConfidence);
    verboseWriter.println();
}
/**
 * Marks good/bad bases on the reads in the given contexts and decides whether
 * the pileup as a whole is callable.
 *
 * @return true if the pileup has usable coverage and an acceptable deletion fraction
 * @throws ReviewedStingException if a non-GATKSAMRecord read is encountered
 */
private boolean filterPileup(Map<String, StratifiedAlignmentContext> stratifiedContexts, BadBaseFilter badBaseFilter) {
    int numDeletions = 0, pileupSize = 0;
    for ( StratifiedAlignmentContext context : stratifiedContexts.values() ) {
        ReadBackedPileup pileup = context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup();
        for ( PileupElement p : pileup ) {
            final SAMRecord read = p.getRead();
            if ( p.isDeletion() ) {
                // if it's a good read, count it
                if ( read.getMappingQuality() >= UAC.MIN_MAPPING_QUALTY_SCORE &&
                     (UAC.USE_BADLY_MATED_READS || !BadMateFilter.hasBadMate(read)) )
                    numDeletions++;
            } else {
                if ( !(read instanceof GATKSAMRecord) )
                    throw new ReviewedStingException("The UnifiedGenotyper currently expects GATKSAMRecords, but instead saw a " + read.getClass());
                GATKSAMRecord GATKrecord = (GATKSAMRecord)read;
                // cache the per-base good/bad flags on the read itself
                GATKrecord.setGoodBases(badBaseFilter, true);
                if ( GATKrecord.isGoodBase(p.getOffset()) )
                    pileupSize++;
            }
        }
    }
    // now, test for bad pileups
    // in all_bases mode, it doesn't matter
    if ( UAC.ALL_BASES_MODE )
        return true;
    // is there no coverage?
    if ( pileupSize == 0 )
        return false;
    // are there too many deletions in the pileup?
    if ( isValidDeletionFraction(UAC.MAX_DELETION_FRACTION) &&
         (double)numDeletions / (double)(pileupSize + numDeletions) > UAC.MAX_DELETION_FRACTION )
        return false;
    return true;
}
/**
* Filters low quality bases out of the SAMRecords.
*/
private class BadBaseFilter implements GATKSAMRecordFilter {
    private ReferenceContext refContext;
    private final UnifiedArgumentCollection UAC;
    public BadBaseFilter(ReferenceContext refContext, UnifiedArgumentCollection UAC) {
        this.refContext = refContext;
        this.UAC = UAC;
    }
    /**
     * @param record the read to evaluate
     * @return a BitSet with a bit set for each base of the read usable for calling
     */
    public BitSet getGoodBases(final GATKSAMRecord record) {
        // all bits are set to false by default
        BitSet bitset = new BitSet(record.getReadLength());
        // if the mapping quality is too low or the mate is bad, we can just zero out the whole read and continue
        if ( record.getMappingQuality() < UAC.MIN_MAPPING_QUALTY_SCORE ||
             (!UAC.USE_BADLY_MATED_READS && BadMateFilter.hasBadMate(record)) ) {
            return bitset;
        }
        // keep only bases meeting the minimum base quality
        byte[] quals = record.getBaseQualities();
        for (int i = 0; i < quals.length; i++) {
            if ( quals[i] >= UAC.MIN_BASE_QUALTY_SCORE )
                bitset.set(i);
        }
        // if a read is too long for the reference context, extend the context (being sure not to extend past the end of the chromosome)
        if ( record.getAlignmentEnd() > refContext.getWindow().getStop() ) {
            GenomeLoc window = refContext.getGenomeLocParser().createGenomeLoc(refContext.getLocus().getContig(), refContext.getWindow().getStart(), Math.min(record.getAlignmentEnd(), referenceReader.getSequenceDictionary().getSequence(refContext.getLocus().getContig()).getSequenceLength()));
            byte[] bases = referenceReader.getSubsequenceAt(window.getContig(), window.getStart(), window.getStop()).getBases();
            StringUtil.toUpperCase(bases);
            refContext = new ReferenceContext(refContext.getGenomeLocParser(),refContext.getLocus(), window, bases);
        }
        // finally, clear any base whose surrounding mismatch window exceeds the threshold
        BitSet mismatches = AlignmentUtils.mismatchesInRefWindow(record, refContext, UAC.MAX_MISMATCHES, MISMATCH_WINDOW_SIZE);
        if ( mismatches != null )
            bitset.and(mismatches);
        return bitset;
    }
}
/**
* @param tracker rod data
*
* @return the dbsnp rod if there is one at this position
*/
protected static DbSNPFeature getDbSNP(RefMetaDataTracker tracker) {
    // first real SNP record (if any) at this locus from the standard dbSNP track
    return DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
}
// At trigger-track sites we emit against the lower of the two trigger thresholds;
// elsewhere a non-zero best AF guess (or genotype-everything mode) is also required.
protected boolean passesEmitThreshold(double conf, int bestAFguess, boolean atTriggerTrack) {
    if ( atTriggerTrack )
        return conf >= Math.min(UAC.TRIGGER_CONFIDENCE_FOR_CALLING, UAC.TRIGGER_CONFIDENCE_FOR_EMITTING);
    return (UAC.GENOTYPE_MODE || bestAFguess != 0) &&
           conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
}
// The confidence needed for an emitted variant to also be marked as a confident call.
protected boolean passesCallThreshold(double conf, boolean atTriggerTrack) {
    final double threshold = atTriggerTrack ? UAC.TRIGGER_CONFIDENCE_FOR_CALLING : UAC.STANDARD_CONFIDENCE_FOR_CALLING;
    return conf >= threshold;
}
/**
 * Fills log10AlleleFrequencyPriors with the standard theta/i prior for AF=i
 * (i in 1..N) and places the remaining probability mass on AF=0.
 * NOTE(review): assumes heterozygosity is small enough that the sum over 1..N
 * stays below 1, otherwise log10(1 - sum) is NaN -- TODO confirm upstream
 * argument validation enforces this.
 */
protected void computeAlleleFrequencyPriors(int N) {
    // calculate the allele frequency priors for 1-N
    double sum = 0.0;
    for (int i = 1; i <= N; i++) {
        double value = UAC.heterozygosity / (double)i;
        log10AlleleFrequencyPriors[i] = Math.log10(value);
        sum += value;
    }
    // null frequency for AF=0 is (1 - sum(all other frequencies))
    log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum);
}
/**
 * Factory for the genotype priors matching the selected genotype-likelihoods model.
 *
 * @throws IllegalArgumentException if the GL model is unrecognized
 */
private static GenotypePriors createGenotypePriors(UnifiedArgumentCollection UAC) {
    GenotypePriors priors;
    switch ( UAC.GLmodel ) {
        case SNP:
            // use flat priors for GLs
            priors = new DiploidSNPGenotypePriors();
            break;
        case DINDEL:
            // create flat priors for Indels, actual priors will depend on event length to be genotyped
            priors = new DiploidIndelGenotypePriors();
            break;
        default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
    }
    return priors;
}
/**
 * Factory for the genotype-likelihoods calculation model selected by the user.
 *
 * @throws IllegalArgumentException if the GL model is unrecognized
 */
private static GenotypeLikelihoodsCalculationModel getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) {
    GenotypeLikelihoodsCalculationModel glcm;
    switch ( UAC.GLmodel ) {
        case SNP:
            glcm = new SNPGenotypeLikelihoodsCalculationModel(UAC, logger);
            break;
        case DINDEL:
            glcm = new DindelGenotypeLikelihoodsCalculationModel(UAC, logger);
            break;
        default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
    }
    return glcm;
}
/**
 * Factory for the allele-frequency calculation model selected by the user.
 *
 * @param N             number of chromosomes in the input (2 * samples)
 * @param logger        logger handed to the model (may be null)
 * @param verboseWriter verbose-output stream handed to the model (may be null)
 * @param UAC           the argument collection holding the AFmodel choice
 * @return a freshly constructed AlleleFrequencyCalculationModel
 * @throws IllegalArgumentException if the AF model is unrecognized
 */
private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
    AlleleFrequencyCalculationModel afcm;
    switch ( UAC.AFmodel ) {
        case EXACT:
            afcm = new ExactAFCalculationModel(N, logger, verboseWriter);
            break;
        case GRID_SEARCH:
            afcm = new GridSearchAFEstimation(N, logger, verboseWriter);
            break;
        // bug fix: the error message previously reported UAC.GLmodel (and called it a
        // "GenotypeCalculationModel") even though this switch dispatches on UAC.AFmodel
        default: throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
    }
    return afcm;
}
}

View File

@ -1,236 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.contexts.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.util.*;
import java.io.PrintStream;
/**
* A variant caller which unifies the approaches of several disparate callers. Works for single-sample and
* multi-sample data. The user can choose from several different incorporated calculation models.
*/
@Reference(window=@Window(start=-200,stop=200))
@By(DataSource.REFERENCE)
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedGenotyperV2.UGStatistics> implements TreeReducible<UnifiedGenotyperV2.UGStatistics> {
// the full set of genotyping arguments, shared with the engine
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
// control the output
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null;
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
protected PrintStream verboseWriter = null;
@Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print any relevant callability metrics output", required = false)
protected PrintStream metricsWriter = null;
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
protected List<String> annotationsToUse = new ArrayList<String>();
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
protected String[] annotationClassesToUse = { "Standard" };
// the calculation arguments
private UnifiedGenotyperEngine UG_engine = null;
// the annotation engine
private VariantAnnotatorEngine annotationEngine;
// enable deletions in the pileup
public boolean includeReadsWithDeletionAtLoci() { return true; }
// enable extended events for indels
public boolean generateExtendedEvents() { return UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL; }
/**
* Inner class for collecting output statistics from the UG
*/
public static class UGStatistics {
/** The total number of passes examined -- i.e., the number of map calls */
long nBasesVisited = 0;
/** The number of bases that were potentially callable -- i.e., those not at excessive coverage or masked with N */
long nBasesCallable = 0;
/** The number of bases called confidently (according to user threshold), either ref or other */
long nBasesCalledConfidently = 0;
/** The number of bases for which calls were emitted */
long nCallsMade = 0;
/** The total number of extended events encountered */
long nExtendedEvents = 0;
double percentCallableOfAll() { return (100.0 * nBasesCallable) / (nBasesVisited-nExtendedEvents); }
double percentCalledOfAll() { return (100.0 * nBasesCalledConfidently) / (nBasesVisited-nExtendedEvents); }
double percentCalledOfCallable() { return (100.0 * nBasesCalledConfidently) / (nBasesCallable); }
}
/**
* Initialize the samples, output, and genotype calculation model
*
**/
public void initialize() {
// get all of the unique sample names
// if we're supposed to assume a single sample, do so
Set<String> samples = new TreeSet<String>();
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
else
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
// initialize the verbose writer
if ( verboseWriter != null )
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior");
annotationEngine = new VariantAnnotatorEngine(getToolkit(), Arrays.asList(annotationClassesToUse), annotationsToUse);
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples);
// initialize the header
writer.writeHeader(new VCFHeader(getHeaderInfo(), samples)) ;
}
private Set<VCFHeaderLine> getHeaderInfo() {
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
// all annotation fields from VariantAnnotatorEngine
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
// annotation (INFO) fields from UnifiedGenotyper
if ( !UAC.NO_SLOD )
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias"));
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
// also, check to see whether comp rods were included
List<ReferenceOrderedDataSource> dataSources = getToolkit().getRodDataSources();
for ( ReferenceOrderedDataSource source : dataSources ) {
if ( source.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership"));
}
else if ( source.getName().startsWith(VariantAnnotatorEngine.dbPrefix) ) {
String name = source.getName().substring(VariantAnnotatorEngine.dbPrefix.length());
headerInfo.add(new VCFInfoHeaderLine(name, 0, VCFHeaderLineType.Flag, name + " Membership"));
}
}
// FORMAT and INFO fields
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY));
// FILTER fields
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING ||
UAC.TRIGGER_CONFIDENCE_FOR_EMITTING < UAC.TRIGGER_CONFIDENCE_FOR_CALLING )
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
return headerInfo;
}
/**
* Compute at a given locus.
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @return the VariantCallContext object
*/
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext);
}
public UGStatistics reduceInit() { return new UGStatistics(); }
public UGStatistics treeReduce(UGStatistics lhs, UGStatistics rhs) {
lhs.nBasesCallable += rhs.nBasesCallable;
lhs.nBasesCalledConfidently += rhs.nBasesCalledConfidently;
lhs.nBasesVisited += rhs.nBasesVisited;
lhs.nCallsMade += rhs.nCallsMade;
return lhs;
}
public UGStatistics reduce(VariantCallContext value, UGStatistics sum) {
// we get a point for reaching reduce
sum.nBasesVisited++;
// can't call the locus because of no coverage
if ( value == null )
return sum;
// A call was attempted -- the base was potentially callable
sum.nBasesCallable++;
// the base was confidently callable
sum.nBasesCalledConfidently += value.confidentlyCalled ? 1 : 0;
// can't make a confident variant call here
if ( value.vc == null )
return sum;
try {
// we are actually making a call
sum.nCallsMade++;
writer.add(value.vc, value.refBase);
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
}
return sum;
}
public void onTraversalDone(UGStatistics sum) {
logger.info(String.format("Visited bases %d", sum.nBasesVisited));
logger.info(String.format("Callable bases %d", sum.nBasesCallable));
logger.info(String.format("Confidently called bases %d", sum.nBasesCalledConfidently));
logger.info(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll()));
logger.info(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll()));
logger.info(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable()));
logger.info(String.format("Actual calls made %d", sum.nCallsMade));
if ( metricsWriter != null ) {
metricsWriter.println(String.format("Visited bases %d", sum.nBasesVisited));
metricsWriter.println(String.format("Callable bases %d", sum.nBasesCallable));
metricsWriter.println(String.format("Confidently called bases %d", sum.nBasesCalledConfidently));
metricsWriter.println(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll()));
metricsWriter.println(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll()));
metricsWriter.println(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable()));
metricsWriter.println(String.format("Actual calls made %d", sum.nCallsMade));
}
}
}

View File

@ -1,64 +0,0 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.broad.tribble.util.variantcontext.VariantContext;
/**
* Created by IntelliJ IDEA.
* User: depristo, ebanks
* Date: Jan 22, 2010
* Time: 2:25:19 PM
*
* Useful helper class to communicate the results of calculateGenotype to framework
*/
/**
 * Lightweight container used to hand the results of calculateGenotype back to the framework.
 *
 * Bundles the emitted variant context (if any), the reference base at the site, and a flag
 * recording whether the site was called confidently (either reference or variant).
 */
public class VariantCallContext {
    /** The variant call itself; stays null when no variant was emitted at this site. */
    public VariantContext vc = null;

    /** The reference base at this locus. */
    public byte refBase;

    /** Was the site called confidently, either reference or variant? */
    public boolean confidentlyCalled = false;

    VariantCallContext(VariantContext vc, boolean wasConfident) {
        this.confidentlyCalled = wasConfident;
        this.vc = vc;
    }

    VariantCallContext(VariantContext vc, byte ref, boolean wasConfident) {
        this.confidentlyCalled = wasConfident;
        this.refBase = ref;
        this.vc = vc;
    }

    // no variant context supplied => we're a ref site
    VariantCallContext(boolean wasConfident) {
        this.confidentlyCalled = wasConfident;
    }

    public void setRefBase(byte ref) {
        this.refBase = ref;
    }
}

View File

@ -14,7 +14,7 @@ public class GenotypeLikelihoodsUnitTest extends BaseTest {
@Test
public void testBasic() {
logger.warn("Executing testIsBetween");
Assert.assertEquals(DELTA, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY,1e-3);
Assert.assertEquals(DELTA, DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY,1e-3);
}
@ -44,13 +44,13 @@ public class GenotypeLikelihoodsUnitTest extends BaseTest {
}
private void testPriorsFromHet(double h, double homRef, double het, double homVar) {
double[] vals = DiploidGenotypePriors.heterozygosity2DiploidProbabilities(h);
double[] vals = DiploidSNPGenotypePriors.heterozygosity2DiploidProbabilities(h);
Assert.assertEquals(vals[0], homRef, DELTA);
Assert.assertEquals(vals[1], het, DELTA);
Assert.assertEquals(vals[2], homVar, DELTA);
Assert.assertEquals(DiploidGenotypePriors.heterozygosity2HomRefProbability(h), homRef, DELTA);
Assert.assertEquals(DiploidGenotypePriors.heterozygosity2HetProbability(h), het, DELTA);
Assert.assertEquals(DiploidGenotypePriors.heterozygosity2HomVarProbability(h), homVar, DELTA);
Assert.assertEquals(DiploidSNPGenotypePriors.heterozygosity2HomRefProbability(h), homRef, DELTA);
Assert.assertEquals(DiploidSNPGenotypePriors.heterozygosity2HetProbability(h), het, DELTA);
Assert.assertEquals(DiploidSNPGenotypePriors.heterozygosity2HomVarProbability(h), homVar, DELTA);
}
//
@ -71,9 +71,9 @@ public class GenotypeLikelihoodsUnitTest extends BaseTest {
private void testGenotypePriors(char ref, double h, double[] array) {
for ( DiploidGenotype g : DiploidGenotype.values() ) {
double val = 0.0;
if ( g.isHomRef((byte)ref) ) val = DiploidGenotypePriors.heterozygosity2HomRefProbability(h);
if ( g.isHet() ) val = DiploidGenotypePriors.heterozygosity2HetProbability(h);
if ( g.isHomVar((byte)ref) ) val = DiploidGenotypePriors.heterozygosity2HomVarProbability(h);
if ( g.isHomRef((byte)ref) ) val = DiploidSNPGenotypePriors.heterozygosity2HomRefProbability(h);
if ( g.isHet() ) val = DiploidSNPGenotypePriors.heterozygosity2HetProbability(h);
if ( g.isHomVar((byte)ref) ) val = DiploidSNPGenotypePriors.heterozygosity2HomVarProbability(h);
val = log10(val);
double e = array[g.ordinal()];
@ -100,7 +100,7 @@ public class GenotypeLikelihoodsUnitTest extends BaseTest {
}
private void testPolarizedGenotypePriors(char ref, double h, double pRefError, double[] array) {
DiploidGenotypePriors priors = new DiploidGenotypePriors((byte)ref, h, pRefError);
DiploidSNPGenotypePriors priors = new DiploidSNPGenotypePriors((byte)ref, h, pRefError);
for ( DiploidGenotype g : DiploidGenotype.values() ) {
double val = Math.pow(10, priors.getPrior(g));
double e = array[g.ordinal()];

View File

@ -22,27 +22,27 @@ public class
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void testMultiSamplePilot1Joint() {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("1dde074afe990fd38941de18bd35e188"));
executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
Arrays.asList("aeef287589c010ff33ca5eee116c64f2"));
executeTest("testMultiSamplePilot1", spec);
}
@Test
public void testMultiSamplePilot2Joint() {
public void testMultiSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,050,000", 1,
Arrays.asList("eeed5d52f4bdeed2c73ad5d202cbba26"));
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
Arrays.asList("59f2869835964eaa03fb9838b7d63bca"));
executeTest("testMultiSamplePilot2", spec);
}
@Test
public void testSingleSamplePilot2Joint() {
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("c34a080e6799ef84f73c6e8d16f7d6ea"));
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
Arrays.asList("40b360318a7d0be8ae8f482816cb24c8"));
executeTest("testSingleSamplePilot2", spec);
}
// --------------------------------------------------------------------------------------------------------------
@ -51,7 +51,7 @@ public class
//
// --------------------------------------------------------------------------------------------------------------
private final static String COMPRESSED_OUTPUT_MD5 = "dfbdccf53668b95e3490c198cbad77e8";
private final static String COMPRESSED_OUTPUT_MD5 = "c97fde4b21088eb0d3a5fa2136451ff7";
@Test
public void testCompressedOutput() {
@ -78,7 +78,7 @@ public class
@Test
public void testParallelization() {
String md5 = "ce616a641a2e00d29510e99c6606d151";
String md5 = "376069f464f0e385c409bf6b5e67cc03";
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
@ -105,11 +105,12 @@ public class
@Test
public void testParameter() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "-genotype", "243630a53c3881e23d89ce6bec448109" );
e.put( "-all_bases", "feae0bc485e01bf102d190c7234f3c59" );
e.put( "--min_base_quality_score 26", "48837ae22a75002d448806f6b46dd3b3" );
e.put( "--min_mapping_quality_score 26", "d388e847b655abf56a7fd945bdff5dee" );
e.put( "--max_mismatches_in_40bp_window 5", "416ba33cf1cfc0b4b4a2ad3256c8b56b" );
e.put( "-genotype", "707821d365ea146c74e512adfa315610" );
e.put( "-all_bases", "bf394fc7ee5d87bb7096a301a0a5ab0a" );
e.put( "--min_base_quality_score 26", "f0d3b8e337c9e7a00b457211f3739bda" );
e.put( "--min_mapping_quality_score 26", "f53db471200a23713e08ac4b357e090b" );
e.put( "--max_mismatches_in_40bp_window 5", "21b21d1c15d75db671821c4f119d6318" );
e.put( "--p_nonref_model GRID_SEARCH", "69879139904c2f51a20291bb73763def" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -123,12 +124,12 @@ public class
public void testConfidence() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
Arrays.asList("aedfce6b54d3138e0459f62421185760"));
Arrays.asList("69879139904c2f51a20291bb73763def"));
executeTest("testConfidence1", spec1);
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1,
Arrays.asList("b442c0fd6e0d3365e8bc2d9149eeb6f5"));
Arrays.asList("b928c7c961d0fd7c8a2b6bc9c53be25e"));
executeTest("testConfidence2", spec2);
}
@ -140,8 +141,8 @@ public class
@Test
public void testHeterozyosity() {
HashMap<Double, String> e = new HashMap<Double, String>();
e.put( 0.01, "2588b41cd3debfc2d58fdf6b463bcc07" );
e.put( 1.0 / 1850, "18fa98796906658402cedd6cc3f28be6" );
e.put( 0.01, "73e350d0626a94d224de2f76e621d034" );
e.put( 1.0 / 1850, "81c1e24b956407a4e8d6385da8aadd82" );
for ( Map.Entry<Double, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -160,8 +161,7 @@ public class
@Test
public void testOtherBaseCallModel() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "one_state", "62ee552b32d9193b700c1dd8e3eeb4b9" );
e.put( "three_state", "f2971d0964780b9abfd38e860080c342" );
e.put( "three_state", "57906723c8d483d31504437269c9f59a" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -184,7 +184,7 @@ public class
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("4469242939216eac5eb6b6b73ae6a509"));
Arrays.asList("68bb882fba0d331c8f5bdf774e4e09a4"));
executeTest(String.format("testMultiTechnologies"), spec);
}

View File

@ -4,7 +4,6 @@ import java.io.File
import net.sf.picard.reference.FastaSequenceFile
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.gatk.DownsampleType
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.extensions.samtools._
@ -39,14 +38,7 @@ class VariantCalling(yaml: File,gatkJar: File) {
var ug = new UnifiedGenotyper with StandardCommandLineGATK
ug.analysisName = "UnifiedGenotyper"
ug.input_file = bams
ug.group :+= "Standard"
ug.out = output
ug.min_base_quality_score = Some(10)
ug.min_mapping_quality_score = Some(10)
ug.cap_base_quality_by_mapping_quality = true
ug.standard_min_confidence_threshold_for_emitting = Some(10)
ug.standard_min_confidence_threshold_for_calling = Some(30)
ug.trigger_min_confidence_threshold_for_calling = Some(0)
ug.downsample_to_coverage = Some(300)
ug.dt = Some(DownsampleType.BY_SAMPLE)
ug.scatterCount = 50