Initial version of refactored Unified Genotyper. Using SNP genotype likelihoods and GRID_SEARCH AF estimation models, achieves the exact same results as original UG on 1-2 samples with the exception of strand bias (not implemented yet); other than that I have no idea. Needs tons more testing. Do not use. For Guillermo only.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4377 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-09-29 08:42:25 +00:00
parent 6df7f9318f
commit 0ec07ad99a
20 changed files with 3539 additions and 0 deletions

View File

@ -0,0 +1,355 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
import org.broad.tribble.vcf.VCFConstants;
import java.io.PrintStream;
import java.util.*;
/**
* The model representing how we calculate a genotype given the priors and a pile
* of bases and quality scores
*/
public abstract class AlleleFrequencyCalculationModel implements Cloneable {
public enum Model {
EXACT,
GRID_SEARCH
}
protected int N;
protected AlleleFrequencyMatrix AFMatrix;
protected Set<BiallelicGenotypeLikelihoods> refCalls;
protected Logger logger;
protected PrintStream verboseWriter;
private int minAlleleFrequencyToTest;
protected AlleleFrequencyCalculationModel(int N, Logger logger, PrintStream verboseWriter) {
this.N = N;
this.logger = logger;
this.verboseWriter = verboseWriter;
AFMatrix = new AlleleFrequencyMatrix(N);
refCalls = new HashSet<BiallelicGenotypeLikelihoods>();
}
/**
* Must be overridden by concrete subclasses
* @param tracker rod data
* @param ref reference context
* @param GLs genotype likelihoods
* @param log10AlleleFrequencyPriors priors
* @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results
*/
public abstract void getLog10PNonRef(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, BiallelicGenotypeLikelihoods> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors);
/**
* Can be overridden by concrete subclasses
* @param contexts alignment contexts
* @param GLs genotype likelihoods
* @param log10AlleleFrequencyPosteriors allele frequency results
* @param AFofMaxLikelihood allele frequency of max likelihood
*
* @return calls
*/
public Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
Map<String, BiallelicGenotypeLikelihoods> GLs,
double[] log10AlleleFrequencyPosteriors,
int AFofMaxLikelihood) {
initializeAFMatrix(GLs);
// increment the grid
for (int i = 1; i <= AFofMaxLikelihood; i++) {
// add one more alternate allele
AFMatrix.incrementFrequency();
}
return generateCalls(contexts, GLs, AFofMaxLikelihood);
}
// TODO: get rid of this optimization, it is wrong!
protected int getMinAlleleFrequencyToTest() {
return minAlleleFrequencyToTest;
}
protected void setMinAlleleFrequencyToTest(int minAF) {
minAlleleFrequencyToTest = minAF;
}
protected Map<String, Genotype> generateCalls(Map<String, StratifiedAlignmentContext> contexts,
Map<String, BiallelicGenotypeLikelihoods> GLs,
int frequency) {
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
// first, the potential alt calls
for ( String sample : AFMatrix.getSamples() ) {
BiallelicGenotypeLikelihoods GL = GLs.get(sample);
Allele alleleA = GL.getAlleleA();
Allele alleleB = GL.getAlleleB();
// set the genotype and confidence
Pair<Integer, Double> AFbasedGenotype = AFMatrix.getGenotype(frequency, sample);
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
if ( AFbasedGenotype.first == GenotypeType.AA.ordinal() ) {
myAlleles.add(alleleA);
myAlleles.add(alleleA);
} else if ( AFbasedGenotype.first == GenotypeType.AB.ordinal() ) {
myAlleles.add(alleleA);
myAlleles.add(alleleB);
} else { // ( AFbasedGenotype.first == GenotypeType.BB.ordinal() )
myAlleles.add(alleleB);
myAlleles.add(alleleB);
}
HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());
calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, attributes, false));
}
// now, the clearly ref calls
for ( BiallelicGenotypeLikelihoods GL : refCalls ) {
String sample = GL.getSample();
Allele ref = GL.getAlleleA();
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
myAlleles.add(ref);
myAlleles.add(ref);
HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());
double GQ = GL.getAALikelihoods() - Math.max(GL.getABLikelihoods(), GL.getBBLikelihoods());
calls.put(sample, new Genotype(sample, myAlleles, GQ, null, attributes, false));
}
return calls;
}
protected void initializeAFMatrix(Map<String, BiallelicGenotypeLikelihoods> GLs) {
refCalls.clear();
AFMatrix.clear();
for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) {
if ( isClearRefCall(GL) ) {
refCalls.add(GL);
} else {
AFMatrix.setLikelihoods(GL.getPosteriors(), GL.getSample());
}
}
}
private boolean isClearRefCall(BiallelicGenotypeLikelihoods GL) {
if ( GL.getAlleleA().isNonReference() )
return false;
double[] likelihoods = GL.getLikelihoods();
return ( likelihoods[0] > likelihoods[1] && likelihoods[0] > likelihoods[2]);
}
protected class CalculatedAlleleFrequency {
public double log10PNonRef;
public int altAlleles;
public CalculatedAlleleFrequency(double log10PNonRef, int altAlleles) {
this.log10PNonRef = log10PNonRef;
this.altAlleles = altAlleles;
}
}
private enum GenotypeType { AA, AB, BB }
protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE;
protected static class AlleleFrequencyMatrix {
private double[][] matrix; // allele frequency matrix
private int[] indexes; // matrix to maintain which genotype is active
private int maxN; // total possible frequencies in data
private int frequency; // current frequency
// data structures necessary to maintain a list of the best genotypes and their scores
private ArrayList<String> samples = new ArrayList<String>();
private HashMap<Integer, HashMap<String, Pair<Integer, Double>>> samplesToGenotypesPerAF = new HashMap<Integer, HashMap<String, Pair<Integer, Double>>>();
public AlleleFrequencyMatrix(int N) {
maxN = N;
matrix = new double[N][3];
indexes = new int[N];
clear();
}
public List<String> getSamples() { return samples; }
public void clear() {
frequency = 0;
for (int i = 0; i < maxN; i++)
indexes[i] = 0;
samples.clear();
samplesToGenotypesPerAF.clear();
}
public void setLikelihoods(double AA, double AB, double BB, String sample) {
int index = samples.size();
samples.add(sample);
matrix[index][GenotypeType.AA.ordinal()] = AA;
matrix[index][GenotypeType.AB.ordinal()] = AB;
matrix[index][GenotypeType.BB.ordinal()] = BB;
}
public void setLikelihoods(double[] GLs, String sample) {
int index = samples.size();
samples.add(sample);
matrix[index][GenotypeType.AA.ordinal()] = GLs[0];
matrix[index][GenotypeType.AB.ordinal()] = GLs[1];
matrix[index][GenotypeType.BB.ordinal()] = GLs[2];
}
public void incrementFrequency() {
int N = samples.size();
if ( frequency == 2 * N )
throw new ReviewedStingException("Frequency was incremented past N; how is this possible?");
frequency++;
double greedy = VALUE_NOT_CALCULATED;
int greedyIndex = -1;
for (int i = 0; i < N; i++) {
if ( indexes[i] == GenotypeType.AB.ordinal() ) {
if ( matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()] > greedy ) {
greedy = matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()];
greedyIndex = i;
}
}
else if ( indexes[i] == GenotypeType.AA.ordinal() ) {
if ( matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()] > greedy ) {
greedy = matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()];
greedyIndex = i;
}
// note that we currently don't bother with breaking ties between samples
// (which would be done by looking at the HOM_VAR value) because it's highly
// unlikely that a collision will both occur and that the difference will
// be significant at HOM_VAR...
}
// if this person is already hom var, he can't add another alternate allele
// so we can ignore that case
}
if ( greedyIndex == -1 )
throw new ReviewedStingException("There is no best choice for a new alternate allele; how is this possible?");
if ( indexes[greedyIndex] == GenotypeType.AB.ordinal() )
indexes[greedyIndex] = GenotypeType.BB.ordinal();
else
indexes[greedyIndex] = GenotypeType.AB.ordinal();
}
public double getLikelihoodsOfFrequency() {
double likelihoods = 0.0;
int N = samples.size();
for (int i = 0; i < N; i++)
likelihoods += matrix[i][indexes[i]];
/*
System.out.println(frequency);
for (int i = 0; i < N; i++) {
for (int j=0; j < 3; j++) {
System.out.print(String.valueOf(matrix[i][j]));
System.out.print(indexes[i] == j ? "* " : " ");
}
System.out.println();
}
System.out.println(likelihoods);
System.out.println();
*/
recordGenotypes();
return likelihoods;
}
public Pair<Integer, Double> getGenotype(int frequency, String sample) {
return samplesToGenotypesPerAF.get(frequency).get(sample);
}
private void recordGenotypes() {
HashMap<String, Pair<Integer, Double>> samplesToGenotypes = new HashMap<String, Pair<Integer, Double>>();
int index = 0;
for ( String sample : samples ) {
int genotype = indexes[index];
double score;
int maxEntry = MathUtils.maxElementIndex(matrix[index]);
// if the max value is for the most likely genotype, we can compute next vs. next best
if ( genotype == maxEntry ) {
if ( genotype == GenotypeType.AA.ordinal() )
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AB.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
else if ( genotype == GenotypeType.AB.ordinal() )
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
else // ( genotype == GenotypeType.HOM.ordinal() )
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.AB.ordinal()]);
}
// otherwise, we need to calculate the probability of the genotype
else {
double[] normalized = MathUtils.normalizeFromLog10(matrix[index]);
double chosenGenotype = normalized[genotype];
score = -1.0 * Math.log10(1.0 - chosenGenotype);
}
samplesToGenotypes.put(sample, new Pair<Integer, Double>(genotype, Math.abs(score)));
index++;
}
samplesToGenotypesPerAF.put(frequency, samplesToGenotypes);
}
}
}

View File

@ -0,0 +1,32 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
public enum BaseMismatchModel {
ONE_STATE,
THREE_STATE,
EMPIRICAL
}

View File

@ -0,0 +1,132 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.broad.tribble.util.variantcontext.Allele;
public class BiallelicGenotypeLikelihoods {
private String sample;
private double[] GLs;
private double[] GPs;
private Allele A, B;
/**
* Create a new object for sample with given alleles and genotype likelihoods
*
* @param sample sample name
* @param A allele A
* @param B allele B
* @param log10AALikelihoods AA likelihoods
* @param log10ABLikelihoods AB likelihoods
* @param log10BBLikelihoods BB likelihoods
*/
public BiallelicGenotypeLikelihoods(String sample,
Allele A,
Allele B,
double log10AALikelihoods,
double log10ABLikelihoods,
double log10BBLikelihoods) {
this.sample = sample;
this.A = A;
this.B = B;
this.GLs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods};
this.GPs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods};
}
/**
* Create a new object for sample with given alleles and genotype likelihoods & posteriors
*
* @param sample sample name
* @param A allele A
* @param B allele B
* @param log10AALikelihoods AA likelihoods
* @param log10ABLikelihoods AB likelihoods
* @param log10BBLikelihoods BB likelihoods
* @param log10AAPosteriors AA posteriors
* @param log10ABPosteriors AB posteriors
* @param log10BBPosteriors BB posteriors
*/
public BiallelicGenotypeLikelihoods(String sample,
Allele A,
Allele B,
double log10AALikelihoods,
double log10ABLikelihoods,
double log10BBLikelihoods,
double log10AAPosteriors,
double log10ABPosteriors,
double log10BBPosteriors) {
this.sample = sample;
this.A = A;
this.B = B;
this.GLs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods};
this.GPs = new double[]{log10AAPosteriors, log10ABPosteriors, log10BBPosteriors};
}
public String getSample() {
return sample;
}
public double getAALikelihoods() {
return GLs[0];
}
public double getABLikelihoods() {
return GLs[1];
}
public double getBBLikelihoods() {
return GLs[2];
}
public double[] getLikelihoods() {
return GLs;
}
public double getAAPosteriors() {
return GPs[0];
}
public double getABPosteriors() {
return GPs[1];
}
public double getBBPosteriors() {
return GPs[2];
}
public double[] getPosteriors() {
return GPs;
}
public Allele getAlleleA() {
return A;
}
public Allele getAlleleB() {
return B;
}
}

View File

@ -0,0 +1,64 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broad.tribble.util.variantcontext.Allele;
import java.util.*;
public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
protected DindelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger);
}
public Allele getLikelihoods(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, StratifiedAlignmentContext> contexts,
StratifiedAlignmentContext.StratifiedContextType contextType,
GenotypePriors priors,
Map<String, BiallelicGenotypeLikelihoods> GLs) {
// TODO: check to make sure the priors instanceof a valid priors class
// TODO: create a single set of Alleles to be passed into each BiallelicGenotypeLikelihoods object to minimize memory consumption
for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
// TODO: fill me in
//GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(), refAllele, altAllele, ...));
}
// TODO: return the reference Allele
return null;
}
}

View File

@ -0,0 +1,534 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import static java.lang.Math.log10;
import static java.lang.Math.pow;
/**
* Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors,
* and posteriors given a pile of bases and quality scores
*
* Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object
* calculates:
*
* P(G | D) = P(G) * P(D | G)
*
* where
*
* P(D | G) = sum_i log10 P(bi | G)
*
* and
*
* P(bi | G) = 1 - P(error | q1) if bi is in G
* = P(error | q1) / 3 if bi is not in G
*
* for homozygous genotypes and for heterozygous genotypes:
*
* P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G
* = P(error | q1) / 3 if bi is not in G
*
* for each of the 10 unique diploid genotypes AA, AC, AG, .., TT
*
* Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space.
*
* The priors contain the relative probabilities of each genotype, and must be provided at object creation.
* From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above
* model.
*/
public class DiploidSNPGenotypeLikelihoods implements Cloneable {
protected final static int FIXED_PLOIDY = 2;
protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1;
protected boolean enableCacheFlag = true;
protected boolean VERBOSE = false;
//
// The fundamental data arrays associated with a Genotype Likelhoods object
//
protected double[] log10Likelihoods = null;
protected double[] log10Posteriors = null;
protected DiploidSNPGenotypePriors priors = null;
protected FourBaseLikelihoods fourBaseLikelihoods = null;
/**
* Create a new GenotypeLikelhoods object with flat priors for each diploid genotype
*
* @param m base model
*/
public DiploidSNPGenotypeLikelihoods(BaseMismatchModel m) {
this.priors = new DiploidSNPGenotypePriors();
initialize(m, null);
}
/**
* Create a new GenotypeLikelhoods object with flat priors for each diploid genotype
*
* @param m base model
* @param pl default platform
*/
public DiploidSNPGenotypeLikelihoods(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
this.priors = new DiploidSNPGenotypePriors();
initialize(m, pl);
}
/**
* Create a new GenotypeLikelhoods object with given priors for each diploid genotype
*
* @param m base model
* @param priors priors
*/
public DiploidSNPGenotypeLikelihoods(BaseMismatchModel m, DiploidSNPGenotypePriors priors) {
this.priors = priors;
initialize(m, null);
}
/**
* Create a new GenotypeLikelhoods object with given priors for each diploid genotype
*
* @param m base model
* @param priors priors
* @param pl default platform
*/
public DiploidSNPGenotypeLikelihoods(BaseMismatchModel m, DiploidSNPGenotypePriors priors, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
this.priors = priors;
initialize(m, pl);
}
/**
* Cloning of the object
* @return clone
* @throws CloneNotSupportedException
*/
protected Object clone() throws CloneNotSupportedException {
DiploidSNPGenotypeLikelihoods c = (DiploidSNPGenotypeLikelihoods)super.clone();
c.priors = priors;
c.log10Likelihoods = log10Likelihoods.clone();
c.log10Posteriors = log10Posteriors.clone();
c.fourBaseLikelihoods = (FourBaseLikelihoods)fourBaseLikelihoods.clone();
return c;
}
protected void initialize(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
fourBaseLikelihoods = FourBaseLikelihoodsFactory.makeFourBaseLikelihoods(m, pl);
setToZero();
}
protected void setToZero() {
log10Likelihoods = zeros.clone(); // likelihoods are all zeros
log10Posteriors = priors.getPriors().clone(); // posteriors are all the priors
}
public void setVerbose(boolean v) {
VERBOSE = v;
fourBaseLikelihoods.setVerbose(v);
}
public boolean isVerbose() {
return VERBOSE;
}
public int getMinQScoreToInclude() {
return fourBaseLikelihoods.getMinQScoreToInclude();
}
public void setMinQScoreToInclude(int minQScoreToInclude) {
fourBaseLikelihoods.setMinQScoreToInclude(minQScoreToInclude);
}
/**
* Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values()
* @return likelihoods array
*/
public double[] getLikelihoods() {
return log10Likelihoods;
}
/**
* Returns the likelihood associated with DiploidGenotype g
* @param g genotype
* @return log10 likelihood as a double
*/
public double getLikelihood(DiploidGenotype g) {
return getLikelihoods()[g.ordinal()];
}
/**
* Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values().
*
* @return raw log10 (not-normalized posteriors) as a double array
*/
public double[] getPosteriors() {
return log10Posteriors;
}
/**
* Returns the posterior associated with DiploidGenotype g
* @param g genotpe
* @return raw log10 (not-normalized posteror) as a double
*/
public double getPosterior(DiploidGenotype g) {
return getPosteriors()[g.ordinal()];
}
/**
* Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values().
*
* @return normalized posterors as a double array
*/
public double[] getNormalizedPosteriors() {
double[] normalized = new double[log10Posteriors.length];
double sum = 0.0;
// for precision purposes, we need to add (or really subtract, since everything is negative)
// the largest posterior value from all entries so that numbers don't get too small
double maxValue = log10Posteriors[0];
for (int i = 1; i < log10Posteriors.length; i++) {
if ( maxValue < log10Posteriors[i] )
maxValue = log10Posteriors[i];
}
// collect the posteriors
for ( DiploidGenotype g : DiploidGenotype.values() ) {
double posterior = Math.pow(10, getPosterior(g) - maxValue);
normalized[g.ordinal()] = posterior;
sum += posterior;
}
// normalize
for (int i = 0; i < normalized.length; i++)
normalized[i] /= sum;
return normalized;
}
public DiploidSNPGenotypePriors getPriorObject() {
return priors;
}
/**
* Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values().
*
* @return log10 prior as a double array
*/
public double[] getPriors() {
return priors.getPriors();
}
/**
* Sets the priors
* @param priors priors
*/
public void setPriors(DiploidSNPGenotypePriors priors) {
this.priors = priors;
log10Posteriors = zeros.clone();
for ( DiploidGenotype g : DiploidGenotype.values() ) {
int i = g.ordinal();
log10Posteriors[i] = priors.getPriors()[i] + log10Likelihoods[i];
}
}
/**
* Returns the prior associated with DiploidGenotype g
* @param g genotype
* @return log10 prior as a double
*/
public double getPrior(DiploidGenotype g) {
return getPriors()[g.ordinal()];
}
/**
* Simple function to overload to control the caching of genotype likelihood outputs.
* By default the function trues true -- do enable caching. If you are experimenting with an
* complex calcluation of P(B | G) and caching might not work correctly for you, overload this
* function and return false, so the super() object won't try to cache your GL calculations.
*
* @return true if caching should be enabled, false otherwise
*/
public boolean cacheIsEnabled() {
return enableCacheFlag;
}
public void setEnableCacheFlag(boolean enable) {
enableCacheFlag = enable;
}
public int add(ReadBackedPileup pileup) {
return add(pileup, false, false);
}
/**
* Updates likelihoods and posteriors to reflect the additional observations contained within the
* read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
* pileup
*
* @param pileup read pileup
* @param ignoreBadBases should we ignore bad bases?
* @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality?
* @return the number of good bases found in the pileup
*/
public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
int n = 0;
for ( PileupElement p : pileup ) {
// ignore deletions
if ( p.isDeletion() )
continue;
byte base = p.getBase();
if ( ! ignoreBadBases || ! badBase(base) ) {
byte qual = capBaseQualsAtMappingQual ? (byte)Math.min((int)p.getQual(), p.getMappingQual()) : p.getQual();
n += add(base, qual, p.getRead(), p.getOffset());
}
}
return n;
}
public int add(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
// Handle caching if requested. Just look up the cached result if its available, or compute and store it
DiploidSNPGenotypeLikelihoods gl;
if ( cacheIsEnabled() ) {
if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read) ) {
gl = calculateCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset);
} else {
gl = getCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read);
}
} else {
gl = calculateGenotypeLikelihoods(observedBase, qualityScore, read, offset);
}
// for bad bases, there are no likelihoods
if ( gl == null )
return 0;
double[] likelihoods = gl.getLikelihoods();
for ( DiploidGenotype g : DiploidGenotype.values() ) {
double likelihood = likelihoods[g.ordinal()];
if ( VERBOSE ) {
boolean fwdStrand = ! read.getReadNegativeStrandFlag();
System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n",
observedBase, g, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood);
}
log10Likelihoods[g.ordinal()] += likelihood;
log10Posteriors[g.ordinal()] += likelihood;
}
return 1;
}
static DiploidSNPGenotypeLikelihoods[][][][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseMismatchModel.values().length][EmpiricalSubstitutionProbabilities.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY][2];
protected boolean inCache( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
return getCache(CACHE, observedBase, qualityScore, ploidy, read) != null;
}
protected DiploidSNPGenotypeLikelihoods getCachedGenotypeLikelihoods( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
DiploidSNPGenotypeLikelihoods gl = getCache(CACHE, observedBase, qualityScore, ploidy, read);
if ( gl == null )
throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s",
observedBase, qualityScore, ploidy, read));
return gl;
}
protected DiploidSNPGenotypeLikelihoods calculateCachedGenotypeLikelihoods(byte observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset) {
DiploidSNPGenotypeLikelihoods gl = calculateGenotypeLikelihoods(observedBase, qualityScore, read, offset);
setCache(CACHE, observedBase, qualityScore, ploidy, read, gl);
return gl;
}
protected void setCache( DiploidSNPGenotypeLikelihoods[][][][][][] cache,
byte observedBase, byte qualityScore, int ploidy,
SAMRecord read, DiploidSNPGenotypeLikelihoods val ) {
int m = FourBaseLikelihoodsFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal();
int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read);
int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
int j = qualityScore;
int k = ploidy;
int x = strandIndex(! read.getReadNegativeStrandFlag() );
cache[m][a][i][j][k][x] = val;
}
protected DiploidSNPGenotypeLikelihoods getCache( DiploidSNPGenotypeLikelihoods[][][][][][] cache,
byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
int m = FourBaseLikelihoodsFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal();
int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read);
int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
int j = qualityScore;
int k = ploidy;
int x = strandIndex(! read.getReadNegativeStrandFlag() );
return cache[m][a][i][j][k][x];
}
protected DiploidSNPGenotypeLikelihoods calculateGenotypeLikelihoods(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
FourBaseLikelihoods fbl = fourBaseLikelihoods.computeLog10Likelihoods(observedBase, qualityScore, read, offset);
if ( fbl == null )
return null;
double[] fbLikelihoods = fbl.getLog10Likelihoods();
try {
DiploidSNPGenotypeLikelihoods gl = (DiploidSNPGenotypeLikelihoods)this.clone();
gl.setToZero();
// we need to adjust for ploidy. We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space
double ploidyAdjustment = log10(FIXED_PLOIDY);
for ( DiploidGenotype g : DiploidGenotype.values() ) {
// todo assumes ploidy is 2 -- should be generalized. Obviously the below code can be turned into a loop
double p_base = 0.0;
p_base += pow(10, fbLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base1)] - ploidyAdjustment);
p_base += pow(10, fbLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base2)] - ploidyAdjustment);
double likelihood = log10(p_base);
gl.log10Likelihoods[g.ordinal()] += likelihood;
gl.log10Posteriors[g.ordinal()] += likelihood;
}
if ( VERBOSE ) {
for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); }
System.out.println();
for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", gl.log10Likelihoods[g.ordinal()]); }
System.out.println();
}
return gl;
} catch ( CloneNotSupportedException e ) {
throw new RuntimeException(e);
}
}
// -----------------------------------------------------------------------------------------------------------------
//
//
// helper routines
//
//
// -----------------------------------------------------------------------------------------------------------------
public static int strandIndex(boolean fwdStrand) {
return fwdStrand ? 0 : 1;
}
/**
* Returns true when the observedBase is considered bad and shouldn't be processed by this object. A base
* is considered bad if:
*
* Criterion 1: observed base isn't a A,C,T,G or lower case equivalent
*
* @param observedBase observed base
* @return true if the base is a bad base
*/
protected boolean badBase(byte observedBase) {
return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1;
}
/**
* Return a string representation of this object in a moderately usable form
*
* @return string representation
*/
public String toString() {
double sum = 0;
StringBuilder s = new StringBuilder();
for (DiploidGenotype g : DiploidGenotype.values()) {
s.append(String.format("%s %.10f ", g, log10Likelihoods[g.ordinal()]));
sum += Math.pow(10,log10Likelihoods[g.ordinal()]);
}
s.append(String.format(" %f", sum));
return s.toString();
}
// -----------------------------------------------------------------------------------------------------------------
//
//
// Validation routines
//
//
// -----------------------------------------------------------------------------------------------------------------
public boolean validate() {
return validate(true);
}
public boolean validate(boolean throwException) {
try {
priors.validate(throwException);
for ( DiploidGenotype g : DiploidGenotype.values() ) {
String bad = null;
int i = g.ordinal();
if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) {
bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]);
} else if ( ! MathUtils.wellFormedDouble(log10Posteriors[i]) || ! MathUtils.isNegativeOrZero(log10Posteriors[i]) ) {
bad = String.format("Posterior %f is badly formed", log10Posteriors[i]);
}
if ( bad != null ) {
throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
}
}
} catch ( IllegalStateException e ) {
if ( throwException )
throw new RuntimeException(e);
else
return false;
}
return true;
}
//
// Constant static data
//
final static double[] zeros = new double[DiploidGenotype.values().length];
static {
for ( DiploidGenotype g : DiploidGenotype.values() ) {
zeros[g.ordinal()] = 0.0;
}
}
}

View File

@ -0,0 +1,257 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import java.util.Arrays;
public class DiploidSNPGenotypePriors implements GenotypePriors {
// --------------------------------------------------------------------------------------------------------------
//
// Constants and static information
//
// --------------------------------------------------------------------------------------------------------------
public static final double HUMAN_HETEROZYGOSITY = 1e-3;
public static final double CEU_HETEROZYGOSITY = 1e-3;
public static final double YRI_HETEROZYGOSITY = 1.0 / 850;
/**
* Default value of the prob of seeing a reference error. Used to calculation the
* chance of seeing a true B/C het when the reference is A, which we assume is the product
* of the ref error rate and the het. Default value is Q60
*/
public static final double PROB_OF_REFERENCE_ERROR = 1e-6; // the reference is
private final static double[] flatPriors = new double[DiploidGenotype.values().length];
// --------------------------------------------------------------------------------------------------------------
//
// Diploid priors
//
// --------------------------------------------------------------------------------------------------------------
private double[] priors = null;
// todo -- fix me when this issue is resolved
public static final boolean requirePriorSumToOne = false;
/**
* Create a new DiploidGenotypePriors object with flat priors for each diploid genotype
*/
public DiploidSNPGenotypePriors() {
priors = flatPriors.clone();
}
/**
* Create a new GenotypeLikelihoods object with priors for a diploid with heterozygosity and reference
* base ref
*
* @param ref
* @param heterozygosity
* @param probOfTriStateGenotype The prob of seeing a true B/C het when the reference is A
*/
public DiploidSNPGenotypePriors(byte ref, double heterozygosity, double probOfTriStateGenotype) {
priors = getReferencePolarizedPriors(ref, heterozygosity, probOfTriStateGenotype);
}
/**
* Create a new Genotypelike Likelhoods's object with priors (in log10 space) for each of the DiploteGenotypes
*
* @param log10Priors
*/
public DiploidSNPGenotypePriors(double[] log10Priors) {
priors = log10Priors.clone();
}
/**
* Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values().
*
* @return log10 prior as a double array
*/
public double[] getPriors() {
return priors;
}
/**
* Returns the prior associated with DiploidGenotype g
* @param g
* @return log10 prior as a double
*/
public double getPrior(DiploidGenotype g) {
return getPriors()[g.ordinal()];
}
public double getHeterozygosity() { return HUMAN_HETEROZYGOSITY; }
public boolean validate(boolean throwException) {
try {
if ( requirePriorSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(priors), 1.0) != 0 ) {
throw new IllegalStateException(String.format("Priors don't sum to 1: sum=%f %s", MathUtils.sumLog10(priors), Arrays.toString(priors)));
}
for ( DiploidGenotype g : DiploidGenotype.values() ) {
int i = g.ordinal();
if ( ! MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) {
String bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i]));
throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
}
}
} catch ( IllegalStateException e ) {
if ( throwException )
throw new RuntimeException(e);
else
return false;
}
return true;
}
// --------------------------------------------------------------------------------------------------------------
//
// Static functionality
//
// --------------------------------------------------------------------------------------------------------------
/**
* Returns homozygous-reference, heterozygous, and homozygous-non-ref probabilities given a heterozygosity
* value, as elements 0, 1, and 2 of a double[], respectively
*
* @param h the heterozygosity [probability of a base being heterozygous]
*/
@Deprecated
public static double[] heterozygosity2DiploidProbabilities(double h) {
double[] pdbls = new double[3];
pdbls[0] = heterozygosity2HomRefProbability(h);
pdbls[1] = heterozygosity2HetProbability(h);
pdbls[2] = heterozygosity2HomVarProbability(h);
return pdbls;
}
/**
*
* @param h
* @return
*/
public static double heterozygosity2HomRefProbability(double h) {
if (MathUtils.isNegative(h)) {
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
}
double v = 1.0 - (3.0 * h / 2.0);
if (MathUtils.isNegative(v)) {
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
}
return v;
}
public static double heterozygosity2HetProbability(double h) {
if (MathUtils.isNegative(h)) {
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
}
return h;
}
public static double heterozygosity2HomVarProbability(double h) {
if (MathUtils.isNegative(h)) {
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
}
return h / 2.0;
}
/**
* Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector
* appropriately.
*
* Suppose A is the reference base, and we are given the probability of being hom-ref, het, and hom-var,
* and that pTriSateGenotype is the true probability of observing reference A and a true genotype of B/C
* then this sets the priors to:
*
* AA = hom-ref
* AC = AG = AT = (het - pTriStateGenotype) / 3
* CC = GG = TT = hom-var / 3
* CG = CT = GT = pTriStateGenotype / 3
*
* So that we get:
*
* hom-ref + 3 * (het - pTriStateGenotype) / 3 + 3 * hom-var / 3 + 3 * pTriStateGenotype
* hom-ref + het - pTriStateGenotype + hom-var + pTriStateGenotype
* hom-ref + het + hom-var
* = 1
*
* @param ref
* @param heterozyosity
* @param pRefError
*/
public static double[] getReferencePolarizedPriors(byte ref, double heterozyosity, double pRefError ) {
if ( ! MathUtils.isBounded(pRefError, 0.0, 0.01) ) {
throw new RuntimeException(String.format("BUG: p Reference error is out of bounds (0.0 - 0.01) is allow range %f", pRefError));
}
double pTriStateGenotype = heterozyosity * pRefError;
// if ( pTriStateGenotype >= heterozyosity ) {
// throw new RuntimeException(String.format("p Tristate genotype %f is greater than the heterozygosity %f", pTriStateGenotype, heterozyosity));
// }
double pHomRef = heterozygosity2HomRefProbability(heterozyosity);
double pHet = heterozygosity2HetProbability(heterozyosity);
double pHomVar = heterozygosity2HomVarProbability(heterozyosity);
if (MathUtils.compareDoubles(pHomRef + pHet + pHomVar, 1.0) != 0) {
throw new RuntimeException(String.format("BUG: Prior probabilities don't sum to one => %f, %f, %f", pHomRef, pHet, pHomVar));
}
double[] priors = new double[DiploidGenotype.values().length];
for ( DiploidGenotype g : DiploidGenotype.values() ) {
double POfG;
final double nOnRefHets = 3;
final double nOffRefHets = 3;
final double nHomVars = 3;
if ( g.isHomRef(ref) ) { POfG = pHomRef; }
else if ( g.isHomVar(ref) ) { POfG = pHomVar / nHomVars; }
else if ( g.isHetRef(ref) ) { POfG = (pHet - pTriStateGenotype ) / nOnRefHets; }
else { POfG = pTriStateGenotype / nOffRefHets; }
priors[g.ordinal()] = Math.log10(POfG);
}
return priors;
}
static {
for ( DiploidGenotype g : DiploidGenotype.values() ) {
flatPriors[g.ordinal()] = Math.log10(1.0 / DiploidGenotype.values().length);
}
}
}

View File

@ -0,0 +1,301 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.BaseUtils;
import static java.lang.Math.log10;
import java.util.TreeMap;
import java.util.EnumMap;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMReadGroupRecord;
public class EmpiricalSubstitutionProbabilities extends FourBaseLikelihoods {
// --------------------------------------------------------------------------------------------------------------
//
// Static methods to manipulate machine platforms
//
// --------------------------------------------------------------------------------------------------------------
public enum SequencerPlatform {
SOLEXA, // Solexa / Illumina
ROCHE454, // 454
SOLID, // SOLiD
CG, // Complete Genomics
UNKNOWN // No idea -- defaulting to 1/3
}
private static TreeMap<String, SequencerPlatform> PLFieldToSequencerPlatform = new TreeMap<String, SequencerPlatform>();
private static void bind(String s, SequencerPlatform x) {
PLFieldToSequencerPlatform.put(s, x);
PLFieldToSequencerPlatform.put(s.toUpperCase(), x);
PLFieldToSequencerPlatform.put(s.toLowerCase(), x);
}
//
// Static list of platforms supported by this system.
//
static {
bind("LS454", SequencerPlatform.ROCHE454);
bind("454", SequencerPlatform.ROCHE454);
bind("ILLUMINA", SequencerPlatform.SOLEXA);
bind("SOLEXA", SequencerPlatform.SOLEXA);
bind("SOLID", SequencerPlatform.SOLID);
bind("ABI_SOLID", SequencerPlatform.SOLID);
bind("CG", SequencerPlatform.CG);
}
public static SequencerPlatform standardizeSequencerPlatform( final String sequencerString ) {
String lcSequencerString = sequencerString == null ? null : sequencerString.toLowerCase();
if ( sequencerString != null && PLFieldToSequencerPlatform.containsKey(lcSequencerString) ) {
return PLFieldToSequencerPlatform.get(lcSequencerString);
} else {
return SequencerPlatform.UNKNOWN;
}
}
private static ThreadLocal<SAMRecord> lastReadForPL = new ThreadLocal<SAMRecord>();
private static ThreadLocal<SequencerPlatform> plOfLastRead = new ThreadLocal<SequencerPlatform>();
public static SequencerPlatform getReadSequencerPlatform( SAMRecord read ) {
if ( lastReadForPL.get() != read ) {
lastReadForPL.set(read);
SAMReadGroupRecord readGroup = read.getReadGroup();
final String platformName = readGroup == null ? null : readGroup.getPlatform();
plOfLastRead.set(standardizeSequencerPlatform(platformName));
}
return plOfLastRead.get();
}
public int getReadSequencerPlatformIndex( SAMRecord read ) {
return getReadSequencerPlatform(read).ordinal();
}
// --------------------------------------------------------------------------------------------------------------
//
// Static methods to get at the transition tables themselves
//
// --------------------------------------------------------------------------------------------------------------
/**
* A matrix of value i x j -> log10(p) where
*
* i - byte of the miscalled base (i.e., A)
* j - byte of the presumed true base (i.e., C)
* log10p - empirical probability p that A is actually C
*
* The table is available for each technology
*/
private final static EnumMap<SequencerPlatform, double[][]> log10pTrueGivenMiscall = new EnumMap<SequencerPlatform, double[][]>(SequencerPlatform.class);
private static void addMisCall(final SequencerPlatform pl, byte miscalledBase, byte trueBase, double p) {
if ( ! log10pTrueGivenMiscall.containsKey(pl) )
log10pTrueGivenMiscall.put(pl, new double[4][4]);
double[][] misCallProbs = log10pTrueGivenMiscall.get(pl);
int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase);
int j = BaseUtils.simpleBaseToBaseIndex(trueBase);
misCallProbs[i][j] = log10(p);
}
private static double getProbMiscallIsBase(SequencerPlatform pl, byte miscalledBase, byte trueBase) {
int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase);
int j = BaseUtils.simpleBaseToBaseIndex(trueBase);
double logP = log10pTrueGivenMiscall.get(pl)[i][j];
if ( logP == 0.0 )
throw new RuntimeException(String.format("Bad miscall base request miscalled=%c true=%c", miscalledBase, trueBase));
else
return logP;
}
private static void addSolexa() {
SequencerPlatform pl = SequencerPlatform.SOLEXA;
addMisCall(pl, BaseUtils.A, BaseUtils.C, 57.7/100.0);
addMisCall(pl, BaseUtils.A, BaseUtils.G, 17.1/100.0);
addMisCall(pl, BaseUtils.A, BaseUtils.T, 25.2/100.0);
addMisCall(pl, BaseUtils.C, BaseUtils.A, 34.9/100.0);
addMisCall(pl, BaseUtils.C, BaseUtils.G, 11.3/100.0);
addMisCall(pl, BaseUtils.C, BaseUtils.T, 53.9/100.0);
addMisCall(pl, BaseUtils.G, BaseUtils.A, 31.9/100.0);
addMisCall(pl, BaseUtils.G, BaseUtils.C, 5.1/100.0);
addMisCall(pl, BaseUtils.G, BaseUtils.T, 63.0/100.0);
addMisCall(pl, BaseUtils.T, BaseUtils.A, 45.8/100.0);
addMisCall(pl, BaseUtils.T, BaseUtils.C, 22.1/100.0);
addMisCall(pl, BaseUtils.T, BaseUtils.G, 32.0/100.0);
}
private static void addSOLiD() {
SequencerPlatform pl = SequencerPlatform.SOLID;
addMisCall(pl, BaseUtils.A, BaseUtils.C, 18.7/100.0);
addMisCall(pl, BaseUtils.A, BaseUtils.G, 42.5/100.0);
addMisCall(pl, BaseUtils.A, BaseUtils.T, 38.7/100.0);
addMisCall(pl, BaseUtils.C, BaseUtils.A, 27.0/100.0);
addMisCall(pl, BaseUtils.C, BaseUtils.G, 18.9/100.0);
addMisCall(pl, BaseUtils.C, BaseUtils.T, 54.1/100.0);
addMisCall(pl, BaseUtils.G, BaseUtils.A, 61.0/100.0);
addMisCall(pl, BaseUtils.G, BaseUtils.C, 15.7/100.0);
addMisCall(pl, BaseUtils.G, BaseUtils.T, 23.2/100.0);
addMisCall(pl, BaseUtils.T, BaseUtils.A, 40.5/100.0);
addMisCall(pl, BaseUtils.T, BaseUtils.C, 34.3/100.0);
addMisCall(pl, BaseUtils.T, BaseUtils.G, 25.2/100.0);
}
private static void add454() {
SequencerPlatform pl = SequencerPlatform.ROCHE454;
addMisCall(pl, BaseUtils.A, BaseUtils.C, 23.2/100.0);
addMisCall(pl, BaseUtils.A, BaseUtils.G, 42.6/100.0);
addMisCall(pl, BaseUtils.A, BaseUtils.T, 34.3/100.0);
addMisCall(pl, BaseUtils.C, BaseUtils.A, 19.7/100.0);
addMisCall(pl, BaseUtils.C, BaseUtils.G, 8.4/100.0);
addMisCall(pl, BaseUtils.C, BaseUtils.T, 71.9/100.0);
addMisCall(pl, BaseUtils.G, BaseUtils.A, 71.5/100.0);
addMisCall(pl, BaseUtils.G, BaseUtils.C, 6.6/100.0);
addMisCall(pl, BaseUtils.G, BaseUtils.T, 21.9/100.0);
addMisCall(pl, BaseUtils.T, BaseUtils.A, 43.8/100.0);
addMisCall(pl, BaseUtils.T, BaseUtils.C, 37.8/100.0);
addMisCall(pl, BaseUtils.T, BaseUtils.G, 18.5/100.0);
}
private static void addCG() {
SequencerPlatform pl = SequencerPlatform.CG;
addMisCall(pl, BaseUtils.A, BaseUtils.C, 28.2/100.0);
addMisCall(pl, BaseUtils.A, BaseUtils.G, 28.7/100.0);
addMisCall(pl, BaseUtils.A, BaseUtils.T, 43.1/100.0);
addMisCall(pl, BaseUtils.C, BaseUtils.A, 29.8/100.0);
addMisCall(pl, BaseUtils.C, BaseUtils.G, 18.6/100.0);
addMisCall(pl, BaseUtils.C, BaseUtils.T, 51.6/100.0);
addMisCall(pl, BaseUtils.G, BaseUtils.A, 32.5/100.0);
addMisCall(pl, BaseUtils.G, BaseUtils.C, 21.4/100.0);
addMisCall(pl, BaseUtils.G, BaseUtils.T, 46.1/100.0);
addMisCall(pl, BaseUtils.T, BaseUtils.A, 42.6/100.0);
addMisCall(pl, BaseUtils.T, BaseUtils.C, 34.1/100.0);
addMisCall(pl, BaseUtils.T, BaseUtils.G, 23.3/100.0);
}
private static void addUnknown() {
SequencerPlatform pl = SequencerPlatform.UNKNOWN;
for ( byte b1 : BaseUtils.BASES ) {
for ( byte b2 : BaseUtils.BASES ) {
if ( b1 != b2 )
addMisCall(pl, b1, b2, 1.0/3.0);
}
}
}
static {
addSolexa();
add454();
addSOLiD();
addCG();
addUnknown();
}
// --------------------------------------------------------------------------------------------------------------
//
// The actual objects themselves
//
// --------------------------------------------------------------------------------------------------------------
private boolean raiseErrorOnUnknownPlatform = true;
private SequencerPlatform defaultPlatform = SequencerPlatform.UNKNOWN;
//
// forwarding constructors -- don't do anything at all
//
public EmpiricalSubstitutionProbabilities() { super(); }
public EmpiricalSubstitutionProbabilities(boolean raiseErrorOnUnknownPlatform) {
super();
this.raiseErrorOnUnknownPlatform = raiseErrorOnUnknownPlatform;
}
public EmpiricalSubstitutionProbabilities(SequencerPlatform assumeUnknownPlatformsAreThis) {
super();
if ( assumeUnknownPlatformsAreThis != null ) {
raiseErrorOnUnknownPlatform = false;
defaultPlatform = assumeUnknownPlatformsAreThis;
}
}
/**
* Cloning of the object
* @return clone
* @throws CloneNotSupportedException
*/
protected Object clone() throws CloneNotSupportedException {
return super.clone();
}
// -----------------------------------------------------------------------------------------------------------------
//
//
// calculation of p(B|GT)
//
//
// -----------------------------------------------------------------------------------------------------------------
protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
boolean fwdStrand = ! read.getReadNegativeStrandFlag();
SequencerPlatform pl = getReadSequencerPlatform(read);
if ( pl == SequencerPlatform.UNKNOWN ) {
if ( raiseErrorOnUnknownPlatform )
throw new RuntimeException("Unknown sequencer platform for read " + read.format() + "; your BAM file is either missing the PL tag for some read groups or an unsupported platform is being used.");
else {
pl = defaultPlatform;
//System.out.printf("Using default platform %s", pl);
}
}
//System.out.printf("%s for %s%n", pl, read);
double log10p;
if ( fwdStrand ) {
log10p = getProbMiscallIsBase(pl, observedBase, chromBase);
} else {
log10p = getProbMiscallIsBase(pl, BaseUtils.simpleComplement(observedBase), BaseUtils.simpleComplement(chromBase));
}
//System.out.printf("p = %f for %s %c %c fwd=%b %d at %s%n", pow(10,log10p), pl, observedBase, chromBase, fwdStrand, offset, read.getReadName() );
return log10p;
}
}

View File

@ -0,0 +1,50 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import java.util.*;
import java.io.PrintStream;
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
protected ExactAFCalculationModel(int N, Logger logger, PrintStream verboseWriter) {
super(N, logger, verboseWriter);
}
public void getLog10PNonRef(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, BiallelicGenotypeLikelihoods> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
// TODO: implement me based on
}
}

View File

@ -0,0 +1,373 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.*;
import static java.lang.Math.log10;
import static java.lang.Math.pow;
/**
* Stable, error checking version of strict 4 base likelihoods. Useful for calculating the likelihoods, priors,
* and posteriors given a pile of bases and quality scores (in conjuncion with GenotypeLikelihoods)
*
* Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object
* calculates:
*
* P(b | D) = P(b) * P(D | b)
*
* where
*
* P(D | b) = sum_i log10 P(bi | b)
*
* and
*
* P(bi | b) = 1 - P(error | q1) if bi = b
* = P(error | q1) / 3 if bi != b
*
*
*/
public abstract class FourBaseLikelihoods implements Cloneable {
protected boolean enableCacheFlag = true;
//
// The fundamental data array associated with 4-base likelihoods
//
protected double[] log10Likelihoods = null;
/**
* If true, lots of output will be generated about the Likelihoods at each site
*/
private boolean verbose = false;
/**
* Bases with Q scores below this threshold aren't included in the Likelihood calculation
*/
private int minQScoreToInclude = 0;
/**
* Create a new FourBaseLikelihoods object
*/
public FourBaseLikelihoods() {
log10Likelihoods = zeros.clone(); // Likelihoods are all zeros
}
/**
* Cloning of the object
* @return clone
* @throws CloneNotSupportedException
*/
protected Object clone() throws CloneNotSupportedException {
FourBaseLikelihoods c = (FourBaseLikelihoods)super.clone();
c.log10Likelihoods = log10Likelihoods.clone();
return c;
}
public void setVerbose(boolean v) {
verbose = v;
}
public boolean isVerbose() {
return verbose;
}
public int getMinQScoreToInclude() {
return minQScoreToInclude;
}
public void setMinQScoreToInclude(int minQScoreToInclude) {
this.minQScoreToInclude = minQScoreToInclude;
}
/**
* Returns an array of log10 likelihoods for each base, indexed by BaseUtils.BASES.ordinal values()
* @return probs
*/
public double[] getLog10Likelihoods() {
return log10Likelihoods;
}
/**
* Returns the likelihood associated with a base
* @param base base
* @return log10 likelihood as a double
*/
public double getLog10Likelihood(byte base) {
return getLog10Likelihood(BaseUtils.simpleBaseToBaseIndex(base));
}
public double getLog10Likelihood(int baseIndex) {
return (baseIndex < 0 ? 0.0 : getLog10Likelihoods()[baseIndex]);
}
/**
* Returns an array of likelihoods for each base, indexed by BaseUtils.BASES.ordinal values()
* @return probs
*/
public double[] getLikelihoods() {
double[] probs = new double[4];
for (int i = 0; i < 4; i++)
probs[i] = Math.pow(10, log10Likelihoods[i]);
return probs;
}
/**
* Returns the likelihoods associated with a base
* @param base base
* @return likelihoods as a double
*/
public double getLikelihood(byte base) {
return getLikelihood(BaseUtils.simpleBaseToBaseIndex(base));
}
public double getLikelihood(int baseIndex) {
return (baseIndex < 0 ? 0.0 : Math.pow(10, log10Likelihoods[baseIndex]));
}
// -----------------------------------------------------------------------------------------------------------------
//
//
// add() -- the heart of
//
//
// -----------------------------------------------------------------------------------------------------------------
/**
* Updates likelihoods and posteriors to reflect an additional observation of observedBase with
* qualityScore.
*
* @param observedBase observed base
* @param qualityScore base quality
* @param read SAM read
* @param offset offset on read
* @return 1 if the base was considered good enough to add to the likelihoods (not Q0 or 'N', for example)
*/
public int add(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
FourBaseLikelihoods fbl = computeLog10Likelihoods(observedBase, qualityScore, read, offset);
if ( fbl == null )
return 0;
for ( byte base : BaseUtils.BASES ) {
double likelihood = fbl.getLikelihood(base);
log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)] += likelihood;
}
if ( verbose ) {
for ( byte base : BaseUtils.BASES ) { System.out.printf("%s\t", (char)base); }
System.out.println();
for ( byte base : BaseUtils.BASES ) { System.out.printf("%.2f\t", log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)]); }
System.out.println();
}
return 1;
}
/**
* Updates likelihoods and posteriors to reflect an additional observation of observedBase with
* qualityScore.
*
* @param observedBase observed base
* @param qualityScore base quality
* @param read SAM read
* @param offset offset on read
* @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example)
*/
public FourBaseLikelihoods computeLog10Likelihoods(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
if ( badBase(observedBase) ) {
return null;
}
try {
if ( qualityScore > getMinQScoreToInclude() ) {
FourBaseLikelihoods fbl = (FourBaseLikelihoods)this.clone();
fbl.log10Likelihoods = zeros.clone();
for ( byte base : BaseUtils.BASES ) {
double likelihood = log10PofObservingBaseGivenChromosome(observedBase, base, qualityScore, read, offset);
if ( verbose ) {
boolean fwdStrand = ! read.getReadNegativeStrandFlag();
System.out.printf(" L(%c | b=%s, Q=%d, S=%s) = %f / %f%n",
observedBase, base, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood);
}
fbl.log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)] = likelihood;
}
return fbl;
}
} catch ( CloneNotSupportedException e ) {
throw new RuntimeException(e);
}
return null;
}
// -----------------------------------------------------------------------------------------------------------------
//
//
// helper routines
//
//
// -----------------------------------------------------------------------------------------------------------------
/**
* Returns true when the observedBase is considered bad and shouldn't be processed by this object. A base
* is considered bad if:
*
* Criterion 1: observed base isn't a A,C,T,G or lower case equivalent
*
* @param observedBase observed base
* @return true if the base is a bad base
*/
private boolean badBase(byte observedBase) {
return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1;
}
/**
* Return a string representation of this object in a moderately usable form
*
* @return string representation
*/
public String toString() {
double sum = 0;
StringBuilder s = new StringBuilder();
for ( byte base : BaseUtils.BASES ) {
int baseIndex = BaseUtils.simpleBaseToBaseIndex(base);
s.append(String.format("%s %.10f ", base, log10Likelihoods[baseIndex]));
sum += Math.pow(10, log10Likelihoods[baseIndex]);
}
s.append(String.format(" %f", sum));
return s.toString();
}
// in general, we don't care about the platform index; EmpiricalSubstitutionProbabilities overloads this
public int getReadSequencerPlatformIndex( SAMRecord read ) {
return 0;
}
// -----------------------------------------------------------------------------------------------------------------
//
//
// Validation routines
//
//
// -----------------------------------------------------------------------------------------------------------------
public boolean validate() {
return validate(true);
}
public boolean validate(boolean throwException) {
try {
for ( byte base : BaseUtils.BASES ) {
int i = BaseUtils.simpleBaseToBaseIndex(base);
if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) {
String bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]);
throw new IllegalStateException(String.format("At %s: %s", base, bad));
}
}
} catch ( IllegalStateException e ) {
if ( throwException )
throw new RuntimeException(e);
else
return false;
}
return true;
}
// -----------------------------------------------------------------------------------------------------------------
//
//
// Hearty math calculations follow
//
// -- these should not be messed with unless you know what you are doing
//
// -----------------------------------------------------------------------------------------------------------------
/**
*
* @param observedBase observed base
* @param chromBase target base
* @param qual base quality
* @param read SAM read
* @param offset offset on read
* @return log10 likelihood
*/
protected double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual, SAMRecord read, int offset) {
if (qual == 0) { // zero quals are wrong
throw new RuntimeException(String.format("Unexpected Q0 base discovered in log10PofObservingBaseGivenChromosome: %c %s %d at %d in %s",
observedBase, chromBase, qual, offset, read));
}
double logP;
if ( observedBase == chromBase ) {
// the base is consistent with the chromosome -- it's 1 - e
//logP = oneMinusData[qual];
double e = pow(10, (qual / -10.0));
logP = log10(1.0 - e);
} else {
// the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error)
logP = qual / -10.0 + log10PofTrueBaseGivenMiscall(observedBase, chromBase, read, offset);
}
//System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP);
return logP;
}
/**
* Must be overridden by concrete subclasses
*
* @param observedBase observed base
* @param chromBase target base
* @param read SAM read
* @param offset offset on read
* @return log10 likelihood
*/
protected abstract double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset);
//
// Constant static data
//
private final static double[] zeros = new double[BaseUtils.BASES.length];
static {
for ( byte base : BaseUtils.BASES ) {
zeros[BaseUtils.simpleBaseToBaseIndex(base)] = 0.0;
}
}
}

View File

@ -0,0 +1,64 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import static org.broadinstitute.sting.playground.gatk.walkers.genotyper.BaseMismatchModel.*;
public class FourBaseLikelihoodsFactory {
//private FourBaseProbabilitiesFactory() {} // cannot be instantiated
public static BaseMismatchModel getBaseMismatchModel(final String name) {
return valueOf(name);
}
public static BaseMismatchModel getBaseMismatchModel(final FourBaseLikelihoods m) {
if ( m instanceof OneStateErrorProbabilities)
return ONE_STATE;
else if ( m instanceof ThreeStateErrorProbabilities)
return THREE_STATE;
else if ( m instanceof EmpiricalSubstitutionProbabilities)
return EMPIRICAL;
throw new RuntimeException("Unexpected BaseMismatchModel " + m.getClass());
}
/**
* General and correct way to create FourBaseLikelihood objects for arbitrary base mismatching models
*
* @param m model
* @param pl default platform
* @return new 4-base model
*/
public static FourBaseLikelihoods makeFourBaseLikelihoods(BaseMismatchModel m,
EmpiricalSubstitutionProbabilities.SequencerPlatform pl ) {
switch ( m ) {
case ONE_STATE: return new OneStateErrorProbabilities();
case THREE_STATE: return new ThreeStateErrorProbabilities();
case EMPIRICAL: return new EmpiricalSubstitutionProbabilities(pl);
default: throw new RuntimeException("Unexpected BaseMismatchModel " + m);
}
}
}

View File

@ -0,0 +1,77 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broad.tribble.util.variantcontext.Allele;
import java.util.Map;
/**
* The model representing how we calculate genotype likelihoods
*/
public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
public enum Model {
SNP,
DINDEL
}
protected UnifiedArgumentCollection UAC;
protected Logger logger;
/**
* Create a new object
* @param logger logger
* @param UAC unified arg collection
*/
protected GenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
this.UAC = UAC.clone();
this.logger = logger;
}
/**
* Must be overridden by concrete subclasses
* @param tracker rod data
* @param ref reference context
* @param contexts stratified alignment contexts
* @param contextType stratified context type
* @param priors priors to use for GLs
* @param GLs hash of sample->GL to fill in
*
* @return genotype likelihoods per sample for AA, AB, BB
*/
public abstract Allele getLikelihoods(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, StratifiedAlignmentContext> contexts,
StratifiedAlignmentContext.StratifiedContextType contextType,
GenotypePriors priors,
Map<String, BiallelicGenotypeLikelihoods> GLs);
}

View File

@ -0,0 +1,35 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
public interface GenotypePriors {
public double[] getPriors();
public double getHeterozygosity();
public boolean validate(boolean throwException);
}

View File

@ -0,0 +1,97 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import java.util.*;
import java.io.PrintStream;
public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel {
// for use in optimizing the P(D|AF) calculations:
// how much off from the max likelihoods do we need to be before we can quit calculating?
protected static final double LOG10_OPTIMIZATION_EPSILON = 8.0;
protected GridSearchAFEstimation(int N, Logger logger, PrintStream verboseWriter) {
super(N, logger, verboseWriter);
}
public void getLog10PNonRef(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, BiallelicGenotypeLikelihoods> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
initializeAFMatrix(GLs);
// first, calculate for AF=0 (no change to matrix)
log10AlleleFrequencyPosteriors[0] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[0];
double maxLikelihoodSeen = log10AlleleFrequencyPosteriors[0];
// TODO: get rid of this optimization, it is wrong!
int minAlleleFrequencyToTest = getMinAlleleFrequencyToTest();
int maxAlleleFrequencyToTest = AFMatrix.getSamples().size() * 2;
// for each minor allele frequency, calculate log10PofDgivenAFi
for (int i = 1; i <= maxAlleleFrequencyToTest; i++) {
// add one more alternate allele
AFMatrix.incrementFrequency();
// calculate new likelihoods
log10AlleleFrequencyPosteriors[i] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[i];
// an optimization to speed up the calculation: if we are beyond the local maximum such
// that subsequent likelihoods won't factor into the confidence score, just quit
if ( i >= minAlleleFrequencyToTest && maxLikelihoodSeen - log10AlleleFrequencyPosteriors[i] > LOG10_OPTIMIZATION_EPSILON ) {
UnifiedGenotyperEngine.ignoreAlleleFrequenciesAboveI(log10AlleleFrequencyPosteriors, i);
return;
}
if ( log10AlleleFrequencyPosteriors[i] > maxLikelihoodSeen )
maxLikelihoodSeen = log10AlleleFrequencyPosteriors[i];
}
}
/**
* Overrides the super class
* @param contexts alignment contexts
* @param GLs genotype likelihoods
* @param log10AlleleFrequencyPosteriors allele frequency results
*
* @return calls
*/
public Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
Map<String, BiallelicGenotypeLikelihoods> GLs,
double[] log10AlleleFrequencyPosteriors,
int AFofMaxLikelihood) {
return generateCalls(contexts, GLs, AFofMaxLikelihood);
}
}

View File

@ -0,0 +1,60 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import net.sf.samtools.SAMRecord;
/**
* This implements the old style CRD calculation of the chance that a base being a true chromBase given
* an miscalled base, in which the p is e, grabbing all of the probability. It shouldn't be used
*/
public class OneStateErrorProbabilities extends FourBaseLikelihoods {
//
// forwarding constructors -- don't do anything at all
//
public OneStateErrorProbabilities() { super(); }
/**
* Cloning of the object
* @return clone
* @throws CloneNotSupportedException
*/
protected Object clone() throws CloneNotSupportedException {
return super.clone();
}
/**
*
* @param observedBase observed base
* @param chromBase target base
* @param read SAM read
* @param offset offset on read
* @return log10 likelihood
*/
protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
return 0; // equivalent to e model
}
}

View File

@ -0,0 +1,138 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broad.tribble.util.variantcontext.Allele;
import org.apache.log4j.Logger;
import java.util.*;
public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
// the alternate allele with the largest sum of quality scores
protected Byte bestAlternateAllele = null;
protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger);
}
public Allele getLikelihoods(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, StratifiedAlignmentContext> contexts,
StratifiedAlignmentContext.StratifiedContextType contextType,
GenotypePriors priors,
Map<String, BiallelicGenotypeLikelihoods> GLs) {
if ( !(priors instanceof DiploidSNPGenotypePriors) )
throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model");
byte refBase = ref.getBase();
Allele refAllele = Allele.create(refBase, true);
// find the alternate allele with the largest sum of quality scores
if ( contextType == StratifiedAlignmentContext.StratifiedContextType.COMPLETE )
initializeBestAlternateAllele(refBase, contexts);
// if there are no non-ref bases...
if ( bestAlternateAllele == null ) {
// did we trigger on the provided track?
boolean atTriggerTrack = tracker.getReferenceMetaData(UnifiedGenotyperEngine.TRIGGER_TRACK_NAME, false).size() > 0;
// if we don't want all bases, then we don't need to calculate genotype likelihoods
if ( !atTriggerTrack && !UAC.ALL_BASES_MODE && !UAC.GENOTYPE_MODE )
return refAllele;
// otherwise, choose any alternate allele (it doesn't really matter)
bestAlternateAllele = (byte)(refBase != 'A' ? 'A' : 'C');
}
Allele altAllele = Allele.create(bestAlternateAllele, false);
for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
ReadBackedPileup pileup = sample.getValue().getContext(contextType).getBasePileup();
// create the GenotypeLikelihoods object
DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.baseModel, (DiploidSNPGenotypePriors)priors, UAC.defaultPlatform);
GL.add(pileup, true, UAC.CAP_BASE_QUALITY);
double[] likelihoods = GL.getLikelihoods();
double[] posteriors = GL.getPosteriors();
DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(refBase);
DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(refBase, bestAlternateAllele);
DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(bestAlternateAllele);
GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),
refAllele,
altAllele,
likelihoods[refGenotype.ordinal()],
likelihoods[hetGenotype.ordinal()],
likelihoods[homGenotype.ordinal()],
posteriors[refGenotype.ordinal()],
posteriors[hetGenotype.ordinal()],
posteriors[homGenotype.ordinal()]));
}
return refAllele;
}
protected void initializeBestAlternateAllele(byte ref, Map<String, StratifiedAlignmentContext> contexts) {
int[] qualCounts = new int[4];
for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
// calculate the sum of quality scores for each base
ReadBackedPileup pileup = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup();
for ( PileupElement p : pileup ) {
// ignore deletions
if ( p.isDeletion() )
continue;
int index = BaseUtils.simpleBaseToBaseIndex(p.getBase());
if ( index >= 0 )
qualCounts[index] += p.getQual();
}
}
// set the non-ref base with maximum quality score sum
int maxCount = 0;
bestAlternateAllele = null;
for ( byte altAllele : BaseUtils.BASES ) {
if ( altAllele == ref )
continue;
int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
if ( qualCounts[index] > maxCount ) {
maxCount = qualCounts[index];
bestAlternateAllele = altAllele;
}
}
}
}

View File

@ -0,0 +1,63 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import static java.lang.Math.log10;
import net.sf.samtools.SAMRecord;
public class ThreeStateErrorProbabilities extends FourBaseLikelihoods {
//
// forwarding constructors -- don't do anything at all
//
public ThreeStateErrorProbabilities() { super(); }
/**
* Cloning of the object
* @return clone
* @throws CloneNotSupportedException
*/
protected Object clone() throws CloneNotSupportedException {
return super.clone();
}
/**
* Simple log10(3) cached value
*/
protected static final double log103 = log10(3.0);
/**
*
* @param observedBase observed base
* @param chromBase target base
* @param read SAM read
* @param offset offset on read
* @return log10 likelihood
*/
protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
return -log103; // equivalent to e / 3 model
}
}

View File

@ -0,0 +1,125 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
public class UnifiedArgumentCollection {
// control the various models to be used
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while DINDEL is also available for calling indels.", required = false)
public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false)
public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ when using the SNP Genotype Likelihoods model -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL;
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY;
// control the output
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false)
public boolean GENOTYPE_MODE = false;
@Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output all callable bases?", required = false)
public boolean ALL_BASES_MODE = false;
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false)
public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
@Argument(fullName = "trigger_min_confidence_threshold_for_calling", shortName = "trig_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be called", required = false)
public double TRIGGER_CONFIDENCE_FOR_CALLING = 30.0;
@Argument(fullName = "trigger_min_confidence_threshold_for_emitting", shortName = "trig_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
public double TRIGGER_CONFIDENCE_FOR_EMITTING = 30.0;
@Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false)
public boolean NO_SLOD = false;
// control the error modes
@Hidden
@Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
public String ASSUME_SINGLE_SAMPLE = null;
@Hidden
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
public EmpiricalSubstitutionProbabilities.SequencerPlatform defaultPlatform = null;
// control the various parameters to be used
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
public int MIN_BASE_QUALTY_SCORE = 10;
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false)
public int MIN_MAPPING_QUALTY_SCORE = 10;
@Argument(fullName = "max_mismatches_in_40bp_window", shortName = "mm40", doc = "Maximum number of mismatches within a 40 bp window (20bp on either side) around the target position for a read to be used for calling", required = false)
public int MAX_MISMATCHES = 3;
@Argument(fullName = "use_reads_with_bad_mates", shortName = "bad_mates", doc = "Use reads whose mates are mapped excessively far away for calling", required = false)
public boolean USE_BADLY_MATED_READS = false;
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
public Double MAX_DELETION_FRACTION = 0.05;
@Argument(fullName = "cap_base_quality_by_mapping_quality", shortName = "cap_base_qual", doc = "Cap the base quality of any given base by its read's mapping quality", required = false)
public boolean CAP_BASE_QUALITY = false;
public UnifiedArgumentCollection clone() {
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
uac.GLmodel = GLmodel;
uac.baseModel = baseModel;
uac.heterozygosity = heterozygosity;
uac.GENOTYPE_MODE = GENOTYPE_MODE;
uac.ALL_BASES_MODE = ALL_BASES_MODE;
uac.NO_SLOD = NO_SLOD;
uac.ASSUME_SINGLE_SAMPLE = ASSUME_SINGLE_SAMPLE;
uac.defaultPlatform = defaultPlatform;
uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING;
uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING;
uac.TRIGGER_CONFIDENCE_FOR_CALLING = TRIGGER_CONFIDENCE_FOR_CALLING;
uac.TRIGGER_CONFIDENCE_FOR_EMITTING = TRIGGER_CONFIDENCE_FOR_EMITTING;
uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE;
uac.MIN_MAPPING_QUALTY_SCORE = MIN_MAPPING_QUALTY_SCORE;
uac.MAX_MISMATCHES = MAX_MISMATCHES;
uac.USE_BADLY_MATED_READS = USE_BADLY_MATED_READS;
uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION;
return uac;
}
}

View File

@ -0,0 +1,477 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.Allele;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.*;
import org.broad.tribble.vcf.VCFConstants;
import org.broad.tribble.dbsnp.DbSNPFeature;
import java.io.PrintStream;
import java.util.*;
public class UnifiedGenotyperEngine {
public static final String TRIGGER_TRACK_NAME = "trigger";
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
// the unified argument collection
private UnifiedArgumentCollection UAC = null;
// the annotation engine
private VariantAnnotatorEngine annotationEngine;
// the model used for calculating genotypes
private ThreadLocal<GenotypeLikelihoodsCalculationModel> glcm = new ThreadLocal<GenotypeLikelihoodsCalculationModel>();
// the model used for calculating p(non-ref)
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
private double[] log10AlleleFrequencyPriors;
// the allele frequency likelihoods (allocated once as an optimization)
private ThreadLocal<double[]> log10AlleleFrequencyPosteriors = new ThreadLocal<double[]>();
// the priors object
private GenotypePriors genotypePriors;
// the various loggers and writers
private Logger logger = null;
private PrintStream verboseWriter = null;
// number of chromosomes (2 * samples) in input
private int N;
// the standard filter to use for calls below the confidence threshold but above the emit threshold
private static final Set<String> filter = new HashSet<String>(1);
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
// get the number of samples
// if we're supposed to assume a single sample, do so
int numSamples;
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
numSamples = 1;
else
numSamples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()).size();
initialize(UAC, null, null, null, numSamples);
}
public UnifiedGenotyperEngine(UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
initialize(UAC, logger, verboseWriter, engine, numSamples);
}
private void initialize(UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
this.UAC = UAC;
this.logger = logger;
this.verboseWriter = verboseWriter;
this.annotationEngine = engine;
N = 2 * numSamples;
log10AlleleFrequencyPriors = new double[N+1];
computeAlleleFrequencyPriors(N);
genotypePriors = createGenotypePriors(UAC);
filter.add(LOW_QUAL_FILTER_NAME);
}
/**
* Compute at a given locus.
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @return the VariantCallContext object
*/
public VariantCallContext runGenotyper(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
// initialize the GenotypeCalculationModel for this thread if that hasn't been done yet
if ( glcm.get() == null ) {
glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
log10AlleleFrequencyPosteriors.set(new double[N+1]);
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
}
BadReadPileupFilter badReadPileupFilter = new BadReadPileupFilter(refContext);
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, badReadPileupFilter);
if ( stratifiedContexts == null )
return null;
Map<String, BiallelicGenotypeLikelihoods> GLs = new HashMap<String, BiallelicGenotypeLikelihoods>();
Allele refAllele = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE, genotypePriors, GLs);
// estimate our confidence in a reference call and return
if ( GLs.size() == 0 )
return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), false);
// reset the optimization value and determine the p(AF>0)
// TODO: get rid of this optimization, it is wrong!
afcm.get().setMinAlleleFrequencyToTest(0);
// zero out the AFs above the N for this position
ignoreAlleleFrequenciesAboveI(log10AlleleFrequencyPosteriors.get(), 2 * GLs.size());
afcm.get().getLog10PNonRef(tracker, refContext, GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
// find the most likely frequency
int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get());
// calculate p(f>0)
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get());
double sum = 0.0;
for (int i = 1; i <= N; i++)
sum += normalizedPosteriors[i];
double PofF = Math.min(sum, 1.0); // deal with precision errors
double phredScaledConfidence;
if ( bestAFguess != 0 ) {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
if ( Double.isInfinite(phredScaledConfidence) )
phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0];
} else {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
if ( Double.isInfinite(phredScaledConfidence) ) {
sum = 0.0;
for (int i = 1; i <= N; i++) {
if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
break;
sum += log10AlleleFrequencyPosteriors.get()[i];
}
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
}
}
// did we trigger on the provided track?
boolean atTriggerTrack = tracker.getReferenceMetaData(TRIGGER_TRACK_NAME, false).size() > 0;
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
if ( !UAC.ALL_BASES_MODE && !passesEmitThreshold(phredScaledConfidence, bestAFguess, atTriggerTrack) ) {
// technically, at this point our confidence in a reference call isn't accurately estimated
// because it didn't take into account samples with no data, so let's get a better estimate
return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), true);
}
// create the genotypes
Map<String, Genotype> genotypes = afcm.get().assignGenotypes(stratifiedContexts, GLs, log10AlleleFrequencyPosteriors.get(), bestAFguess);
// next, get the variant context data (alleles, attributes, etc.)
HashSet<Allele> alleles = new HashSet<Allele>();
alleles.add(refAllele);
for ( Genotype g : genotypes.values() )
alleles.addAll(g.getAlleles());
// *** note that calculating strand bias involves overwriting data structures, so we do that last
HashMap<String, Object> attributes = new HashMap<String, Object>();
DbSNPFeature dbsnp = getDbSNP(tracker);
if ( dbsnp != null )
attributes.put(VariantContext.ID_KEY, dbsnp.getRsID());
// if the site was downsampled, record that fact
if ( rawContext.hasPileupBeenDownsampled() )
attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);
if ( !UAC.NO_SLOD ) {
// TODO: implement me
//Map<String, BiallelicGenotypeLikelihoods> forwardGLs = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD, genotypePriors);
//Map<String, BiallelicGenotypeLikelihoods> reverseGLs = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE, genotypePriors);
}
GenomeLoc loc = refContext.getLocus();
VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes);
if ( annotationEngine != null ) {
// first off, we want to use the *unfiltered* context for the annotations
stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE);
Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vc);
vc = variantContexts.iterator().next(); //We know the collection will always have exactly 1 element.
}
// print out stats if we have a writer
if ( verboseWriter != null )
printVerboseData(vc, PofF, phredScaledConfidence, normalizedPosteriors);
VariantCallContext call = new VariantCallContext(vc, passesCallThreshold(phredScaledConfidence, atTriggerTrack));
call.setRefBase(refContext.getBase());
return call;
}
private static boolean isValidDeletionFraction(double d) {
return ( d >= 0.0 && d <= 1.0 );
}
private Map<String, StratifiedAlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, BadReadPileupFilter badReadPileupFilter) {
Map<String, StratifiedAlignmentContext> stratifiedContexts = null;
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL && rawContext.hasExtendedEventPileup() ) {
ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup();
// filter the context based on min mapping quality
ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE);
// filter the context based on bad mates and mismatch rate
pileup = pileup.getFilteredPileup(badReadPileupFilter);
// don't call when there is no coverage
if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE )
return null;
// stratify the AlignmentContext and cut by sample
stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UAC.ASSUME_SINGLE_SAMPLE);
} else if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP && !rawContext.hasExtendedEventPileup() ) {
byte ref = refContext.getBase();
if ( !BaseUtils.isRegularBase(ref) )
return null;
ReadBackedPileup rawPileup = rawContext.getBasePileup();
// filter the context based on min base and mapping qualities
ReadBackedPileup pileup = rawPileup.getBaseAndMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE, UAC.MIN_MAPPING_QUALTY_SCORE);
// filter the context based on bad mates and mismatch rate
pileup = pileup.getFilteredPileup(badReadPileupFilter);
// don't call when there is no coverage
if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE )
return null;
// are there too many deletions in the pileup?
if ( isValidDeletionFraction(UAC.MAX_DELETION_FRACTION) &&
(double)pileup.getNumberOfDeletions() / (double)pileup.size() > UAC.MAX_DELETION_FRACTION )
return null;
// stratify the AlignmentContext and cut by sample
stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UAC.ASSUME_SINGLE_SAMPLE);
}
return stratifiedContexts;
}
/**
* @param AFs AF array
* @param freqI allele frequency I
*/
protected static void ignoreAlleleFrequenciesAboveI(double[] AFs, int freqI) {
while ( ++freqI < AFs.length )
AFs[freqI] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
}
private VariantCallContext estimateReferenceConfidence(Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples) {
// TODO: implement me
double P_of_ref = 1.0;
// use the AF=0 prob if it's calculated
//if ( ignoreCoveredSamples )
// P_of_ref = 1.0 - PofFs[BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele)];
// for each sample that we haven't examined yet
//for ( String sample : samples ) {
// boolean isCovered = contexts.containsKey(sample);
// if ( ignoreCoveredSamples && isCovered )
// continue;
P_of_ref = 0.5;
// int depth = isCovered ? contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup().size() : 0;
// P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
//}
return new VariantCallContext(QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING);
}
protected void printVerboseData(VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors) {
for (int i = 0; i <= N; i++) {
StringBuilder AFline = new StringBuilder("AFINFO\t");
AFline.append(vc.getChr()).append(":").append(vc.getStart()).append("\t");
AFline.append(vc.getReference()).append("\t");
if ( vc.isBiallelic() )
AFline.append(vc.getAlternateAllele(0)).append("\t");
else
AFline.append("N/A\t");
AFline.append(i + "/" + N + "\t");
AFline.append(String.format("%.2f\t", ((float)i)/N));
AFline.append(String.format("%.8f\t", log10AlleleFrequencyPriors[i]));
AFline.append(String.format("%.8f\t", log10AlleleFrequencyPosteriors.get()[i]));
AFline.append(String.format("%.8f\t", normalizedPosteriors[i]));
verboseWriter.println(AFline.toString());
}
verboseWriter.println("P(f>0) = " + PofF);
verboseWriter.println("Qscore = " + phredScaledConfidence);
verboseWriter.println();
}
/**
* Filters low quality reads out of the pileup.
*/
private class BadReadPileupFilter implements PileupElementFilter {
private ReferenceContext refContext;
public BadReadPileupFilter(ReferenceContext ref) {
// create the +/-20bp window
GenomeLoc window = GenomeLocParser.createGenomeLoc(ref.getLocus().getContig(), Math.max(ref.getWindow().getStart(), ref.getLocus().getStart()-20), Math.min(ref.getWindow().getStop(), ref.getLocus().getStart()+20));
byte[] bases = new byte[41];
System.arraycopy(ref.getBases(), (int)Math.max(0, window.getStart()-ref.getWindow().getStart()), bases, 0, (int)window.size());
refContext = new ReferenceContext(ref.getLocus(), window, bases);
}
public boolean allow(PileupElement pileupElement) {
return ((UAC.USE_BADLY_MATED_READS || !BadMateFilter.hasBadMate(pileupElement.getRead())) &&
AlignmentUtils.mismatchesInRefWindow(pileupElement, refContext, true) <= UAC.MAX_MISMATCHES );
}
}
/**
* @param tracker rod data
*
* @return the dbsnp rod if there is one at this position
*/
protected static DbSNPFeature getDbSNP(RefMetaDataTracker tracker) {
return DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
}
protected boolean passesEmitThreshold(double conf, int bestAFguess, boolean atTriggerTrack) {
return (atTriggerTrack ?
(conf >= Math.min(UAC.TRIGGER_CONFIDENCE_FOR_CALLING, UAC.TRIGGER_CONFIDENCE_FOR_EMITTING)) :
((UAC.GENOTYPE_MODE || bestAFguess != 0) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING)));
}
protected boolean passesCallThreshold(double conf, boolean atTriggerTrack) {
return (atTriggerTrack ?
(conf >= UAC.TRIGGER_CONFIDENCE_FOR_CALLING) :
(conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING));
}
protected void computeAlleleFrequencyPriors(int N) {
// calculate sum(1/i)
double sigma_1_over_I = 0.0;
for (int i = 1; i <= N; i++)
sigma_1_over_I += 1.0 / (double)i;
// delta = theta / sum(1/i)
double delta = UAC.heterozygosity / sigma_1_over_I;
// calculate the null allele frequencies for 1-N
double sum = 0.0;
for (int i = 1; i <= N; i++) {
double value = delta / (double)i;
log10AlleleFrequencyPriors[i] = Math.log10(value);
sum += value;
}
// null frequency for AF=0 is (1 - sum(all other frequencies))
log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum);
}
// TODO: enable me
protected void computeAlleleFrequencyPriorsCorrect(int N) {
// calculate the allele frequency priors for 1-N
double sum = 0.0;
for (int i = 1; i <= N; i++) {
double value = UAC.heterozygosity / (double)i;
log10AlleleFrequencyPriors[i] = Math.log10(value);
sum += value;
}
// null frequency for AF=0 is (1 - sum(all other frequencies))
log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum);
}
private static GenotypePriors createGenotypePriors(UnifiedArgumentCollection UAC) {
GenotypePriors priors;
switch ( UAC.GLmodel ) {
case SNP:
// use flat priors for GLs
priors = new DiploidSNPGenotypePriors();
break;
case DINDEL:
// TODO: create indel priors object
priors = null;
break;
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
}
return priors;
}
private static GenotypeLikelihoodsCalculationModel getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) {
GenotypeLikelihoodsCalculationModel glcm;
switch ( UAC.GLmodel ) {
case SNP:
glcm = new SNPGenotypeLikelihoodsCalculationModel(UAC, logger);
break;
case DINDEL:
glcm = new DindelGenotypeLikelihoodsCalculationModel(UAC, logger);
break;
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
}
return glcm;
}
private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
AlleleFrequencyCalculationModel afcm;
switch ( UAC.AFmodel ) {
case EXACT:
afcm = new ExactAFCalculationModel(N, logger, verboseWriter);
break;
case GRID_SEARCH:
afcm = new GridSearchAFEstimation(N, logger, verboseWriter);
break;
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
}
return afcm;
}
}

View File

@ -0,0 +1,241 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.contexts.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.util.*;
import java.io.PrintStream;
/**
* A variant caller which unifies the approaches of several disparate callers. Works for single-sample and
* multi-sample data. The user can choose from several different incorporated calculation models.
*/
@Reference(window=@Window(start=-200,stop=200))
@By(DataSource.REFERENCE)
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedGenotyperV2.UGStatistics> implements TreeReducible<UnifiedGenotyperV2.UGStatistics> {
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
// control the output
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null;
@Argument(fullName="variants_out",shortName="varout",doc="Please use --out instead",required=false)
@Deprecated
protected String varout;
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
protected PrintStream verboseWriter = null;
@Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print any relevant callability metrics output", required = false)
protected PrintStream metricsWriter = null;
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
protected List<String> annotationsToUse = new ArrayList<String>();
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
protected String[] annotationClassesToUse = { "Standard" };
// the calculation arguments
private UnifiedGenotyperEngine UG_engine = null;
// the annotation engine
private VariantAnnotatorEngine annotationEngine;
// samples in input
private Set<String> samples = new TreeSet<String>();
// enable deletions in the pileup
public boolean includeReadsWithDeletionAtLoci() { return true; }
// enable extended events for indels
public boolean generateExtendedEvents() { return UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL; }
/**
* Inner class for collecting output statistics from the UG
*/
public static class UGStatistics {
/** The total number of passes examined -- i.e., the number of map calls */
long nBasesVisited = 0;
/** The number of bases that were potentially callable -- i.e., those not at excessive coverage or masked with N */
long nBasesCallable = 0;
/** The number of bases called confidently (according to user threshold), either ref or other */
long nBasesCalledConfidently = 0;
/** The number of bases for which calls were emitted */
long nCallsMade = 0;
/** The total number of extended events encountered */
long nExtendedEvents = 0;
double percentCallableOfAll() { return (100.0 * nBasesCallable) / (nBasesVisited-nExtendedEvents); }
double percentCalledOfAll() { return (100.0 * nBasesCalledConfidently) / (nBasesVisited-nExtendedEvents); }
double percentCalledOfCallable() { return (100.0 * nBasesCalledConfidently) / (nBasesVisited-nExtendedEvents); }
}
/**
* Initialize the samples, output, and genotype calculation model
*
**/
public void initialize() {
// get all of the unique sample names
// if we're supposed to assume a single sample, do so
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
else
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
// initialize the verbose writer
if ( verboseWriter != null )
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior");
annotationEngine = new VariantAnnotatorEngine(getToolkit(), Arrays.asList(annotationClassesToUse), annotationsToUse);
UG_engine = new UnifiedGenotyperEngine(UAC, logger, verboseWriter, annotationEngine, samples.size());
// initialize the header
writer.writeHeader(new VCFHeader(getHeaderInfo(), samples)) ;
}
private Set<VCFHeaderLine> getHeaderInfo() {
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
// all annotation fields from VariantAnnotatorEngine
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
// annotation (INFO) fields from UnifiedGenotyper
if ( !UAC.NO_SLOD )
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias"));
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
// also, check to see whether comp rods were included
List<ReferenceOrderedDataSource> dataSources = getToolkit().getRodDataSources();
for ( ReferenceOrderedDataSource source : dataSources ) {
if ( source.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership"));
}
else if ( source.getName().startsWith(VariantAnnotatorEngine.dbPrefix) ) {
String name = source.getName().substring(VariantAnnotatorEngine.dbPrefix.length());
headerInfo.add(new VCFInfoHeaderLine(name, 0, VCFHeaderLineType.Flag, name + " Membership"));
}
}
// FORMAT and INFO fields
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings());
// FILTER fields
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING ||
UAC.TRIGGER_CONFIDENCE_FOR_EMITTING < UAC.TRIGGER_CONFIDENCE_FOR_CALLING )
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
return headerInfo;
}
/**
* Compute at a given locus.
*
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @return the VariantCallContext object
*/
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
return UG_engine.runGenotyper(tracker, refContext, rawContext);
}
public UGStatistics reduceInit() { return new UGStatistics(); }
public UGStatistics treeReduce(UGStatistics lhs, UGStatistics rhs) {
lhs.nBasesCallable += rhs.nBasesCallable;
lhs.nBasesCalledConfidently += rhs.nBasesCalledConfidently;
lhs.nBasesVisited += rhs.nBasesVisited;
lhs.nCallsMade += rhs.nCallsMade;
return lhs;
}
public UGStatistics reduce(VariantCallContext value, UGStatistics sum) {
// we get a point for reaching reduce
sum.nBasesVisited++;
// can't call the locus because of no coverage
if ( value == null )
return sum;
// A call was attempted -- the base was potentially callable
sum.nBasesCallable++;
// the base was confidently callable
sum.nBasesCalledConfidently += value.confidentlyCalled ? 1 : 0;
// can't make a confident variant call here
if ( value.vc == null )
return sum;
try {
// we are actually making a call
sum.nCallsMade++;
writer.add(value.vc, value.refBase);
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
}
return sum;
}
public void onTraversalDone(UGStatistics sum) {
logger.info(String.format("Visited bases %d", sum.nBasesVisited));
logger.info(String.format("Callable bases %d", sum.nBasesCallable));
logger.info(String.format("Confidently called bases %d", sum.nBasesCalledConfidently));
logger.info(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll()));
logger.info(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll()));
logger.info(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable()));
logger.info(String.format("Actual calls made %d", sum.nCallsMade));
if ( metricsWriter != null ) {
metricsWriter.println(String.format("Visited bases %d", sum.nBasesVisited));
metricsWriter.println(String.format("Callable bases %d", sum.nBasesCallable));
metricsWriter.println(String.format("Confidently called bases %d", sum.nBasesCalledConfidently));
metricsWriter.println(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll()));
metricsWriter.println(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll()));
metricsWriter.println(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable()));
metricsWriter.println(String.format("Actual calls made %d", sum.nCallsMade));
}
}
}

View File

@ -0,0 +1,64 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
import org.broad.tribble.util.variantcontext.VariantContext;
/**
* Created by IntelliJ IDEA.
* User: depristo, ebanks
* Date: Jan 22, 2010
* Time: 2:25:19 PM
*
* Useful helper class to communicate the results of calculateGenotype to framework
*/
public class VariantCallContext {
public VariantContext vc = null;
public byte refBase;
// Was the site called confidently, either reference or variant?
public boolean confidentlyCalled = false;
VariantCallContext(VariantContext vc, boolean confidentlyCalledP) {
this.vc = vc;
this.confidentlyCalled = confidentlyCalledP;
}
VariantCallContext(VariantContext vc, byte ref, boolean confidentlyCalledP) {
this.vc = vc;
this.refBase = ref;
this.confidentlyCalled = confidentlyCalledP;
}
// blank variant context => we're a ref site
VariantCallContext(boolean confidentlyCalledP) {
this.confidentlyCalled = confidentlyCalledP;
}
public void setRefBase(byte ref) {
this.refBase = ref;
}
}