Initial version of refactored Unified Genotyper. Using SNP genotype likelihoods and GRID_SEARCH AF estimation models, achieves the exact same results as original UG on 1-2 samples with the exception of strand bias (not implemented yet); other than that I have no idea. Needs tons more testing. Do not use. For Guillermo only.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4377 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6df7f9318f
commit
0ec07ad99a
|
|
@ -0,0 +1,355 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
|
||||
import org.broad.tribble.vcf.VCFConstants;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* The model representing how we calculate a genotype given the priors and a pile
|
||||
* of bases and quality scores
|
||||
*/
|
||||
public abstract class AlleleFrequencyCalculationModel implements Cloneable {

    /** Selectable strategies for estimating the allele frequency distribution. */
    public enum Model {
        EXACT,
        GRID_SEARCH
    }

    // Capacity this model was sized for; presumably the number of samples — it sizes
    // the AlleleFrequencyMatrix rows (one row per sample). TODO confirm against callers.
    protected int N;
    // Per-sample grid of AA/AB/BB genotype scores driving the greedy AF search.
    protected AlleleFrequencyMatrix AFMatrix;
    // Samples whose AA (hom-ref) likelihood strictly dominates AB and BB; they bypass
    // the matrix entirely (see isClearRefCall / initializeAFMatrix).
    protected Set<BiallelicGenotypeLikelihoods> refCalls;

    protected Logger logger;
    protected PrintStream verboseWriter;

    // NOTE(review): the TODO on the accessor below marks this optimization as wrong.
    private int minAlleleFrequencyToTest;

    /**
     * Creates the model and pre-allocates its per-sample matrix.
     *
     * @param N             matrix capacity (number of samples)
     * @param logger        logger for diagnostics
     * @param verboseWriter stream for verbose output
     */
    protected AlleleFrequencyCalculationModel(int N, Logger logger, PrintStream verboseWriter) {
        this.N = N;
        this.logger = logger;
        this.verboseWriter = verboseWriter;
        AFMatrix = new AlleleFrequencyMatrix(N);
        refCalls = new HashSet<BiallelicGenotypeLikelihoods>();
    }

    /**
     * Must be overridden by concrete subclasses
     * @param tracker rod data
     * @param ref reference context
     * @param GLs genotype likelihoods
     * @param log10AlleleFrequencyPriors priors
     * @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results
     */
    public abstract void getLog10PNonRef(RefMetaDataTracker tracker,
                                         ReferenceContext ref,
                                         Map<String, BiallelicGenotypeLikelihoods> GLs,
                                         double[] log10AlleleFrequencyPriors,
                                         double[] log10AlleleFrequencyPosteriors);

    /**
     * Can be overridden by concrete subclasses.
     * Rebuilds the matrix from the given GLs, walks the greedy grid up to the
     * maximum-likelihood allele frequency, and emits per-sample genotype calls.
     *
     * @param contexts alignment contexts
     * @param GLs genotype likelihoods
     * @param log10AlleleFrequencyPosteriors allele frequency results
     * @param AFofMaxLikelihood allele frequency of max likelihood
     *
     * @return calls
     */
    public Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
                                                 Map<String, BiallelicGenotypeLikelihoods> GLs,
                                                 double[] log10AlleleFrequencyPosteriors,
                                                 int AFofMaxLikelihood) {
        initializeAFMatrix(GLs);

        // increment the grid
        for (int i = 1; i <= AFofMaxLikelihood; i++) {
            // add one more alternate allele
            AFMatrix.incrementFrequency();
        }

        return generateCalls(contexts, GLs, AFofMaxLikelihood);
    }

    // TODO: get rid of this optimization, it is wrong!
    protected int getMinAlleleFrequencyToTest() {
        return minAlleleFrequencyToTest;
    }

    protected void setMinAlleleFrequencyToTest(int minAF) {
        minAlleleFrequencyToTest = minAF;
    }

    /**
     * Builds the final per-sample Genotype map: matrix-tracked samples get the genotype
     * recorded for the given frequency, clear-ref samples get a hom-ref call.
     *
     * @param contexts  alignment contexts (used for the DP attribute)
     * @param GLs       genotype likelihoods per sample
     * @param frequency allele frequency at which to read genotypes out of the matrix
     * @return sample name to Genotype call
     */
    protected Map<String, Genotype> generateCalls(Map<String, StratifiedAlignmentContext> contexts,
                                                  Map<String, BiallelicGenotypeLikelihoods> GLs,
                                                  int frequency) {
        HashMap<String, Genotype> calls = new HashMap<String, Genotype>();

        // first, the potential alt calls
        for ( String sample : AFMatrix.getSamples() ) {
            BiallelicGenotypeLikelihoods GL = GLs.get(sample);
            Allele alleleA = GL.getAlleleA();
            Allele alleleB = GL.getAlleleB();

            // set the genotype and confidence
            Pair<Integer, Double> AFbasedGenotype = AFMatrix.getGenotype(frequency, sample);
            ArrayList<Allele> myAlleles = new ArrayList<Allele>();
            if ( AFbasedGenotype.first == GenotypeType.AA.ordinal() ) {
                myAlleles.add(alleleA);
                myAlleles.add(alleleA);
            } else if ( AFbasedGenotype.first == GenotypeType.AB.ordinal() ) {
                myAlleles.add(alleleA);
                myAlleles.add(alleleB);
            } else { // ( AFbasedGenotype.first == GenotypeType.BB.ordinal() )
                myAlleles.add(alleleB);
                myAlleles.add(alleleB);
            }

            // DP: total depth over the complete (unstratified) pileup for this sample
            HashMap<String, Object> attributes = new HashMap<String, Object>();
            attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());

            GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
            attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());

            calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, attributes, false));
        }

        // now, the clearly ref calls
        for ( BiallelicGenotypeLikelihoods GL : refCalls ) {
            String sample = GL.getSample();

            // for clear-ref samples alleleA is the reference allele (isClearRefCall
            // rejects any GL whose alleleA is non-reference)
            Allele ref = GL.getAlleleA();
            ArrayList<Allele> myAlleles = new ArrayList<Allele>();
            myAlleles.add(ref);
            myAlleles.add(ref);

            HashMap<String, Object> attributes = new HashMap<String, Object>();
            attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());

            GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
            attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());

            // confidence = margin of the hom-ref likelihood over the best alternative
            double GQ = GL.getAALikelihoods() - Math.max(GL.getABLikelihoods(), GL.getBBLikelihoods());

            calls.put(sample, new Genotype(sample, myAlleles, GQ, null, attributes, false));

        }

        return calls;
    }

    /**
     * Resets the matrix and ref-call set, then routes each sample either to the
     * clear-ref bucket or into the matrix (seeded with its posteriors).
     *
     * @param GLs genotype likelihoods per sample
     */
    protected void initializeAFMatrix(Map<String, BiallelicGenotypeLikelihoods> GLs) {
        refCalls.clear();
        AFMatrix.clear();

        for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) {
            if ( isClearRefCall(GL) ) {
                refCalls.add(GL);
            } else {
                AFMatrix.setLikelihoods(GL.getPosteriors(), GL.getSample());
            }
        }
    }

    /**
     * A sample is a "clear" ref call when its alleleA is the reference allele and
     * its AA likelihood strictly exceeds both AB and BB.
     *
     * @param GL the sample's likelihoods
     * @return true if this sample can be called hom-ref without the matrix
     */
    private boolean isClearRefCall(BiallelicGenotypeLikelihoods GL) {
        if ( GL.getAlleleA().isNonReference() )
            return false;

        double[] likelihoods = GL.getLikelihoods();
        return ( likelihoods[0] > likelihoods[1] && likelihoods[0] > likelihoods[2]);
    }

    /** Simple value pair: log10 P(non-ref) at a given alternate allele count. */
    // NOTE(review): could be static — it captures no enclosing-instance state.
    protected class CalculatedAlleleFrequency {

        public double log10PNonRef;
        public int altAlleles;

        public CalculatedAlleleFrequency(double log10PNonRef, int altAlleles) {
            this.log10PNonRef = log10PNonRef;
            this.altAlleles = altAlleles;
        }
    }

    // Row layout of the matrix: column index == GenotypeType.ordinal().
    private enum GenotypeType { AA, AB, BB }
    // Sentinel guaranteed to compare below any real log10 value.
    protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE;

    /**
     * Greedy grid over alternate allele counts. Each call to incrementFrequency()
     * promotes the single sample whose genotype upgrade (AA->AB or AB->BB) costs the
     * least likelihood, and recordGenotypes() snapshots the per-sample genotype and
     * confidence at each visited frequency for later lookup via getGenotype().
     */
    protected static class AlleleFrequencyMatrix {

        private double[][] matrix;   // allele frequency matrix
        private int[] indexes;       // matrix to maintain which genotype is active
        private int maxN;            // total possible frequencies in data
        private int frequency;       // current frequency

        // data structures necessary to maintain a list of the best genotypes and their scores
        private ArrayList<String> samples = new ArrayList<String>();
        private HashMap<Integer, HashMap<String, Pair<Integer, Double>>> samplesToGenotypesPerAF = new HashMap<Integer, HashMap<String, Pair<Integer, Double>>>();

        /** @param N maximum number of samples (rows) this matrix can hold */
        public AlleleFrequencyMatrix(int N) {
            maxN = N;
            matrix = new double[N][3];
            indexes = new int[N];
            clear();
        }

        public List<String> getSamples() { return samples; }

        /** Resets frequency, active-genotype indexes, samples, and recorded snapshots. */
        public void clear() {
            frequency = 0;
            for (int i = 0; i < maxN; i++)
                indexes[i] = 0;
            samples.clear();
            samplesToGenotypesPerAF.clear();
        }

        /** Appends a sample row with explicit AA/AB/BB scores. */
        public void setLikelihoods(double AA, double AB, double BB, String sample) {
            int index = samples.size();
            samples.add(sample);
            matrix[index][GenotypeType.AA.ordinal()] = AA;
            matrix[index][GenotypeType.AB.ordinal()] = AB;
            matrix[index][GenotypeType.BB.ordinal()] = BB;
        }

        /** Appends a sample row from a 3-element [AA, AB, BB] array. */
        public void setLikelihoods(double[] GLs, String sample) {
            int index = samples.size();
            samples.add(sample);
            matrix[index][GenotypeType.AA.ordinal()] = GLs[0];
            matrix[index][GenotypeType.AB.ordinal()] = GLs[1];
            matrix[index][GenotypeType.BB.ordinal()] = GLs[2];
        }

        /**
         * Adds one alternate allele to the grid by greedily upgrading the sample whose
         * AA->AB or AB->BB transition loses the least log10 likelihood. Hom-var samples
         * cannot absorb another alternate allele and are skipped. The hard ceiling is
         * 2 * samples (diploid), hence the frequency == 2 * N guard.
         */
        public void incrementFrequency() {
            int N = samples.size();
            if ( frequency == 2 * N )
                throw new ReviewedStingException("Frequency was incremented past N; how is this possible?");
            frequency++;

            double greedy = VALUE_NOT_CALCULATED;
            int greedyIndex = -1;
            for (int i = 0; i < N; i++) {

                if ( indexes[i] == GenotypeType.AB.ordinal() ) {
                    if ( matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()] > greedy ) {
                        greedy = matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()];
                        greedyIndex = i;
                    }
                }
                else if ( indexes[i] == GenotypeType.AA.ordinal() ) {
                    if ( matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()] > greedy ) {
                        greedy = matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()];
                        greedyIndex = i;
                    }
                    // note that we currently don't bother with breaking ties between samples
                    // (which would be done by looking at the HOM_VAR value) because it's highly
                    // unlikely that a collision will both occur and that the difference will
                    // be significant at HOM_VAR...
                }
                // if this person is already hom var, he can't add another alternate allele
                // so we can ignore that case
            }
            if ( greedyIndex == -1 )
                throw new ReviewedStingException("There is no best choice for a new alternate allele; how is this possible?");

            if ( indexes[greedyIndex] == GenotypeType.AB.ordinal() )
                indexes[greedyIndex] = GenotypeType.BB.ordinal();
            else
                indexes[greedyIndex] = GenotypeType.AB.ordinal();
        }

        /**
         * Sums the active-genotype scores over all samples at the current frequency,
         * snapshotting per-sample genotypes as a side effect (recordGenotypes).
         *
         * @return total log10 likelihood of the current configuration
         */
        public double getLikelihoodsOfFrequency() {
            double likelihoods = 0.0;
            int N = samples.size();
            for (int i = 0; i < N; i++)
                likelihoods += matrix[i][indexes[i]];

            /*
            System.out.println(frequency);
            for (int i = 0; i < N; i++) {
                for (int j=0; j < 3; j++) {
                    System.out.print(String.valueOf(matrix[i][j]));
                    System.out.print(indexes[i] == j ? "* " : "  ");
                }
                System.out.println();
            }
            System.out.println(likelihoods);
            System.out.println();
            */

            recordGenotypes();

            return likelihoods;
        }

        /**
         * Looks up the snapshot recorded at the given frequency.
         * NOTE(review): only frequencies visited via getLikelihoodsOfFrequency() are
         * recorded; other keys yield an NPE from the missing inner map.
         */
        public Pair<Integer, Double> getGenotype(int frequency, String sample) {
            return samplesToGenotypesPerAF.get(frequency).get(sample);
        }

        /**
         * Records, for the current frequency, each sample's active genotype plus a
         * confidence score: margin to the next-best genotype when the active genotype
         * is also the max-likelihood one, otherwise -log10(1 - P(active genotype)).
         */
        private void recordGenotypes() {
            HashMap<String, Pair<Integer, Double>> samplesToGenotypes = new HashMap<String, Pair<Integer, Double>>();

            int index = 0;
            for ( String sample : samples ) {
                int genotype = indexes[index];

                double score;

                int maxEntry = MathUtils.maxElementIndex(matrix[index]);
                // if the max value is for the most likely genotype, we can compute next vs. next best
                if ( genotype == maxEntry ) {
                    if ( genotype == GenotypeType.AA.ordinal() )
                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AB.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
                    else if ( genotype == GenotypeType.AB.ordinal() )
                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
                    else // ( genotype == GenotypeType.HOM.ordinal() )
                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.AB.ordinal()]);
                }
                // otherwise, we need to calculate the probability of the genotype
                else {
                    double[] normalized = MathUtils.normalizeFromLog10(matrix[index]);
                    double chosenGenotype = normalized[genotype];
                    score = -1.0 * Math.log10(1.0 - chosenGenotype);
                }

                samplesToGenotypes.put(sample, new Pair<Integer, Double>(genotype, Math.abs(score)));
                index++;
            }

            samplesToGenotypesPerAF.put(frequency, samplesToGenotypes);
        }
    }
}
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
/**
 * Selectable base-mismatch models for genotype likelihood calculation; passed to the
 * DiploidSNPGenotypeLikelihoods constructors and dispatched on by
 * FourBaseLikelihoodsFactory.makeFourBaseLikelihoods.
 */
public enum BaseMismatchModel {
    ONE_STATE,
    THREE_STATE,
    EMPIRICAL
}
|
||||
|
|
@ -0,0 +1,132 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
|
||||
public class BiallelicGenotypeLikelihoods {
|
||||
|
||||
private String sample;
|
||||
private double[] GLs;
|
||||
private double[] GPs;
|
||||
private Allele A, B;
|
||||
|
||||
/**
|
||||
* Create a new object for sample with given alleles and genotype likelihoods
|
||||
*
|
||||
* @param sample sample name
|
||||
* @param A allele A
|
||||
* @param B allele B
|
||||
* @param log10AALikelihoods AA likelihoods
|
||||
* @param log10ABLikelihoods AB likelihoods
|
||||
* @param log10BBLikelihoods BB likelihoods
|
||||
*/
|
||||
public BiallelicGenotypeLikelihoods(String sample,
|
||||
Allele A,
|
||||
Allele B,
|
||||
double log10AALikelihoods,
|
||||
double log10ABLikelihoods,
|
||||
double log10BBLikelihoods) {
|
||||
this.sample = sample;
|
||||
this.A = A;
|
||||
this.B = B;
|
||||
this.GLs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods};
|
||||
this.GPs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods};
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new object for sample with given alleles and genotype likelihoods & posteriors
|
||||
*
|
||||
* @param sample sample name
|
||||
* @param A allele A
|
||||
* @param B allele B
|
||||
* @param log10AALikelihoods AA likelihoods
|
||||
* @param log10ABLikelihoods AB likelihoods
|
||||
* @param log10BBLikelihoods BB likelihoods
|
||||
* @param log10AAPosteriors AA posteriors
|
||||
* @param log10ABPosteriors AB posteriors
|
||||
* @param log10BBPosteriors BB posteriors
|
||||
*/
|
||||
public BiallelicGenotypeLikelihoods(String sample,
|
||||
Allele A,
|
||||
Allele B,
|
||||
double log10AALikelihoods,
|
||||
double log10ABLikelihoods,
|
||||
double log10BBLikelihoods,
|
||||
double log10AAPosteriors,
|
||||
double log10ABPosteriors,
|
||||
double log10BBPosteriors) {
|
||||
this.sample = sample;
|
||||
this.A = A;
|
||||
this.B = B;
|
||||
this.GLs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods};
|
||||
this.GPs = new double[]{log10AAPosteriors, log10ABPosteriors, log10BBPosteriors};
|
||||
}
|
||||
|
||||
public String getSample() {
|
||||
return sample;
|
||||
}
|
||||
|
||||
public double getAALikelihoods() {
|
||||
return GLs[0];
|
||||
}
|
||||
|
||||
public double getABLikelihoods() {
|
||||
return GLs[1];
|
||||
}
|
||||
|
||||
public double getBBLikelihoods() {
|
||||
return GLs[2];
|
||||
}
|
||||
|
||||
public double[] getLikelihoods() {
|
||||
return GLs;
|
||||
}
|
||||
|
||||
public double getAAPosteriors() {
|
||||
return GPs[0];
|
||||
}
|
||||
|
||||
public double getABPosteriors() {
|
||||
return GPs[1];
|
||||
}
|
||||
|
||||
public double getBBPosteriors() {
|
||||
return GPs[2];
|
||||
}
|
||||
|
||||
public double[] getPosteriors() {
|
||||
return GPs;
|
||||
}
|
||||
|
||||
public Allele getAlleleA() {
|
||||
return A;
|
||||
}
|
||||
|
||||
public Allele getAlleleB() {
|
||||
return B;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,64 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Placeholder for the Dindel (indel) genotype likelihoods model. The body is entirely
 * unimplemented TODO scaffolding: it iterates the sample contexts without filling in
 * any likelihoods and always returns null instead of the reference allele.
 */
public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {

    /** Constructs the model by delegating to the shared base-class constructor. */
    protected DindelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
        super(UAC, logger);
    }

    /**
     * Intended to fill {@code GLs} with per-sample BiallelicGenotypeLikelihoods and
     * return the reference allele; currently a no-op stub that returns null.
     */
    public Allele getLikelihoods(RefMetaDataTracker tracker,
                                 ReferenceContext ref,
                                 Map<String, StratifiedAlignmentContext> contexts,
                                 StratifiedAlignmentContext.StratifiedContextType contextType,
                                 GenotypePriors priors,
                                 Map<String, BiallelicGenotypeLikelihoods> GLs) {
        // TODO: check to make sure the priors instanceof a valid priors class

        // TODO: create a single set of Alleles to be passed into each BiallelicGenotypeLikelihoods object to minimize memory consumption

        for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
            // TODO: fill me in

            //GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(), refAllele, altAllele, ...));
        }

        // TODO: return the reference Allele
        return null;
    }
}
|
||||
|
|
@ -0,0 +1,534 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
|
||||
/**
|
||||
* Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors,
|
||||
* and posteriors given a pile of bases and quality scores
|
||||
*
|
||||
* Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object
|
||||
* calculates:
|
||||
*
|
||||
* P(G | D) = P(G) * P(D | G)
|
||||
*
|
||||
* where
|
||||
*
|
||||
* P(D | G) = sum_i log10 P(bi | G)
|
||||
*
|
||||
* and
|
||||
*
|
||||
* P(bi | G) = 1 - P(error | q1) if bi is in G
|
||||
* = P(error | q1) / 3 if bi is not in G
|
||||
*
|
||||
* for homozygous genotypes and for heterozygous genotypes:
|
||||
*
|
||||
* P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G
|
||||
* = P(error | q1) / 3 if bi is not in G
|
||||
*
|
||||
* for each of the 10 unique diploid genotypes AA, AC, AG, .., TT
|
||||
*
|
||||
* Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space.
|
||||
*
|
||||
* The priors contain the relative probabilities of each genotype, and must be provided at object creation.
|
||||
* From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above
|
||||
* model.
|
||||
*/
|
||||
public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
||||
    // Diploid model: exactly two chromosome copies per sample.
    protected final static int FIXED_PLOIDY = 2;
    protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1;

    // Whether to cache genotype-likelihood calculations (see the caching hook below).
    protected boolean enableCacheFlag = true;
    protected boolean VERBOSE = false;

    //
    // The fundamental data arrays associated with a Genotype Likelihoods object,
    // both log10-scaled and indexed by DiploidGenotype.ordinal().
    //
    protected double[] log10Likelihoods = null;
    protected double[] log10Posteriors = null;

    // Per-genotype priors; combined with likelihoods to form the posteriors.
    protected DiploidSNPGenotypePriors priors = null;

    // Underlying four-base model used to compute P(base | genotype); built in initialize().
    protected FourBaseLikelihoods fourBaseLikelihoods = null;
|
||||
|
||||
    /**
     * Create a new GenotypeLikelihoods object with flat priors for each diploid genotype
     *
     * @param m base model
     */
    public DiploidSNPGenotypeLikelihoods(BaseMismatchModel m) {
        this.priors = new DiploidSNPGenotypePriors();
        initialize(m, null);
    }

    /**
     * Create a new GenotypeLikelihoods object with flat priors for each diploid genotype
     *
     * @param m  base model
     * @param pl default platform
     */
    public DiploidSNPGenotypeLikelihoods(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
        this.priors = new DiploidSNPGenotypePriors();
        initialize(m, pl);
    }

    /**
     * Create a new GenotypeLikelihoods object with given priors for each diploid genotype
     *
     * @param m      base model
     * @param priors priors
     */
    public DiploidSNPGenotypeLikelihoods(BaseMismatchModel m, DiploidSNPGenotypePriors priors) {
        this.priors = priors;
        initialize(m, null);
    }

    /**
     * Create a new GenotypeLikelihoods object with given priors for each diploid genotype
     *
     * @param m      base model
     * @param priors priors
     * @param pl     default platform
     */
    public DiploidSNPGenotypeLikelihoods(BaseMismatchModel m, DiploidSNPGenotypePriors priors, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
        this.priors = priors;
        initialize(m, pl);
    }
|
||||
|
||||
    /**
     * Cloning of the object. The likelihood/posterior arrays and the four-base model
     * are copied; the priors object is shared with the original (presumably treated
     * as read-only — TODO confirm).
     *
     * @return clone
     * @throws CloneNotSupportedException if a superclass does not support cloning
     */
    protected Object clone() throws CloneNotSupportedException {
        DiploidSNPGenotypeLikelihoods c = (DiploidSNPGenotypeLikelihoods)super.clone();
        c.priors = priors;
        c.log10Likelihoods = log10Likelihoods.clone();
        c.log10Posteriors = log10Posteriors.clone();
        c.fourBaseLikelihoods = (FourBaseLikelihoods)fourBaseLikelihoods.clone();
        return c;
    }
|
||||
|
||||
    /**
     * Builds the four-base likelihoods model for the given mismatch model/platform
     * and resets the likelihood and posterior arrays.
     *
     * @param m  base mismatch model
     * @param pl default sequencer platform (may be null)
     */
    protected void initialize(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
        fourBaseLikelihoods = FourBaseLikelihoodsFactory.makeFourBaseLikelihoods(m, pl);
        setToZero();
    }

    /**
     * Resets state to "no data observed": likelihoods all zero and posteriors equal
     * to the priors. ({@code zeros} is an all-zero template array declared elsewhere
     * in this class — not visible in this chunk; TODO confirm.)
     */
    protected void setToZero() {
        log10Likelihoods = zeros.clone();                 // likelihoods are all zeros
        log10Posteriors = priors.getPriors().clone();     // posteriors are all the priors
    }
|
||||
|
||||
    /** Enables/disables verbose output here and in the underlying four-base model. */
    public void setVerbose(boolean v) {
        VERBOSE = v;
        fourBaseLikelihoods.setVerbose(v);
    }

    /** @return whether verbose output is enabled */
    public boolean isVerbose() {
        return VERBOSE;
    }

    /** @return the minimum base quality score included, per the four-base model */
    public int getMinQScoreToInclude() {
        return fourBaseLikelihoods.getMinQScoreToInclude();
    }

    /** Sets the minimum base quality score to include on the four-base model. */
    public void setMinQScoreToInclude(int minQScoreToInclude) {
        fourBaseLikelihoods.setMinQScoreToInclude(minQScoreToInclude);
    }
|
||||
|
||||
    /**
     * Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values()
     * @return likelihoods array (the internal array, not a copy)
     */
    public double[] getLikelihoods() {
        return log10Likelihoods;
    }

    /**
     * Returns the likelihood associated with DiploidGenotype g
     * @param g genotype
     * @return log10 likelihood as a double
     */
    public double getLikelihood(DiploidGenotype g) {
        return getLikelihoods()[g.ordinal()];
    }

    /**
     * Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values().
     *
     * @return raw log10 (not-normalized posteriors) as a double array (the internal array, not a copy)
     */
    public double[] getPosteriors() {
        return log10Posteriors;
    }

    /**
     * Returns the posterior associated with DiploidGenotype g
     * @param g genotype
     * @return raw log10 (not-normalized posterior) as a double
     */
    public double getPosterior(DiploidGenotype g) {
        return getPosteriors()[g.ordinal()];
    }
|
||||
|
||||
|
||||
/**
|
||||
* Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||
*
|
||||
* @return normalized posterors as a double array
|
||||
*/
|
||||
public double[] getNormalizedPosteriors() {
|
||||
double[] normalized = new double[log10Posteriors.length];
|
||||
double sum = 0.0;
|
||||
|
||||
// for precision purposes, we need to add (or really subtract, since everything is negative)
|
||||
// the largest posterior value from all entries so that numbers don't get too small
|
||||
double maxValue = log10Posteriors[0];
|
||||
for (int i = 1; i < log10Posteriors.length; i++) {
|
||||
if ( maxValue < log10Posteriors[i] )
|
||||
maxValue = log10Posteriors[i];
|
||||
}
|
||||
|
||||
// collect the posteriors
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
double posterior = Math.pow(10, getPosterior(g) - maxValue);
|
||||
normalized[g.ordinal()] = posterior;
|
||||
sum += posterior;
|
||||
}
|
||||
|
||||
// normalize
|
||||
for (int i = 0; i < normalized.length; i++)
|
||||
normalized[i] /= sum;
|
||||
|
||||
return normalized;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public DiploidSNPGenotypePriors getPriorObject() {
|
||||
return priors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||
*
|
||||
* @return log10 prior as a double array
|
||||
*/
|
||||
public double[] getPriors() {
|
||||
return priors.getPriors();
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the priors
|
||||
* @param priors priors
|
||||
*/
|
||||
public void setPriors(DiploidSNPGenotypePriors priors) {
|
||||
this.priors = priors;
|
||||
log10Posteriors = zeros.clone();
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
int i = g.ordinal();
|
||||
log10Posteriors[i] = priors.getPriors()[i] + log10Likelihoods[i];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the prior associated with DiploidGenotype g
|
||||
* @param g genotype
|
||||
* @return log10 prior as a double
|
||||
*/
|
||||
public double getPrior(DiploidGenotype g) {
|
||||
return getPriors()[g.ordinal()];
|
||||
}
|
||||
|
||||
/**
 * Simple function to overload to control the caching of genotype likelihood outputs.
 * By default this function returns true -- do enable caching.  If you are experimenting with a
 * complex calculation of P(B | G) and caching might not work correctly for you, overload this
 * function and return false, so the super() object won't try to cache your GL calculations.
 *
 * @return true if caching should be enabled, false otherwise
 */
public boolean cacheIsEnabled() {
    return enableCacheFlag;
}
|
||||
|
||||
public void setEnableCacheFlag(boolean enable) {
|
||||
enableCacheFlag = enable;
|
||||
}
|
||||
|
||||
public int add(ReadBackedPileup pileup) {
|
||||
return add(pileup, false, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates likelihoods and posteriors to reflect the additional observations contained within the
|
||||
* read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
|
||||
* pileup
|
||||
*
|
||||
* @param pileup read pileup
|
||||
* @param ignoreBadBases should we ignore bad bases?
|
||||
* @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality?
|
||||
* @return the number of good bases found in the pileup
|
||||
*/
|
||||
public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
|
||||
int n = 0;
|
||||
|
||||
for ( PileupElement p : pileup ) {
|
||||
// ignore deletions
|
||||
if ( p.isDeletion() )
|
||||
continue;
|
||||
|
||||
byte base = p.getBase();
|
||||
if ( ! ignoreBadBases || ! badBase(base) ) {
|
||||
byte qual = capBaseQualsAtMappingQual ? (byte)Math.min((int)p.getQual(), p.getMappingQual()) : p.getQual();
|
||||
n += add(base, qual, p.getRead(), p.getOffset());
|
||||
}
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
public int add(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
|
||||
|
||||
// Handle caching if requested. Just look up the cached result if its available, or compute and store it
|
||||
DiploidSNPGenotypeLikelihoods gl;
|
||||
if ( cacheIsEnabled() ) {
|
||||
if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read) ) {
|
||||
gl = calculateCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset);
|
||||
} else {
|
||||
gl = getCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read);
|
||||
}
|
||||
} else {
|
||||
gl = calculateGenotypeLikelihoods(observedBase, qualityScore, read, offset);
|
||||
}
|
||||
|
||||
// for bad bases, there are no likelihoods
|
||||
if ( gl == null )
|
||||
return 0;
|
||||
|
||||
double[] likelihoods = gl.getLikelihoods();
|
||||
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
double likelihood = likelihoods[g.ordinal()];
|
||||
|
||||
if ( VERBOSE ) {
|
||||
boolean fwdStrand = ! read.getReadNegativeStrandFlag();
|
||||
System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n",
|
||||
observedBase, g, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood);
|
||||
}
|
||||
|
||||
log10Likelihoods[g.ordinal()] += likelihood;
|
||||
log10Posteriors[g.ordinal()] += likelihood;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Shared 6-D memoization table of per-observation likelihoods, indexed by
// [mismatch model][sequencer platform][observed base][quality score][ploidy][strand]
// (see setCache/getCache for the index order).
// NOTE(review): static and unsynchronized -- concurrent use could race on reads/writes;
// confirm single-threaded access or make writes idempotent-safe.
static DiploidSNPGenotypeLikelihoods[][][][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseMismatchModel.values().length][EmpiricalSubstitutionProbabilities.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY][2];
|
||||
|
||||
protected boolean inCache( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
|
||||
return getCache(CACHE, observedBase, qualityScore, ploidy, read) != null;
|
||||
}
|
||||
|
||||
protected DiploidSNPGenotypeLikelihoods getCachedGenotypeLikelihoods( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
|
||||
DiploidSNPGenotypeLikelihoods gl = getCache(CACHE, observedBase, qualityScore, ploidy, read);
|
||||
if ( gl == null )
|
||||
throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s",
|
||||
observedBase, qualityScore, ploidy, read));
|
||||
return gl;
|
||||
}
|
||||
|
||||
// Computes the likelihoods for this observation and memoizes the result in the shared CACHE.
// NOTE(review): when the base is bad, calculateGenotypeLikelihoods returns null and we cache
// null, so inCache() remains false for that key and bad bases get recomputed on every hit --
// harmless but worth confirming it is intentional.
protected DiploidSNPGenotypeLikelihoods calculateCachedGenotypeLikelihoods(byte observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset) {
    DiploidSNPGenotypeLikelihoods gl = calculateGenotypeLikelihoods(observedBase, qualityScore, read, offset);
    setCache(CACHE, observedBase, qualityScore, ploidy, read, gl);
    return gl;
}
|
||||
|
||||
protected void setCache( DiploidSNPGenotypeLikelihoods[][][][][][] cache,
|
||||
byte observedBase, byte qualityScore, int ploidy,
|
||||
SAMRecord read, DiploidSNPGenotypeLikelihoods val ) {
|
||||
int m = FourBaseLikelihoodsFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal();
|
||||
int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read);
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
|
||||
int j = qualityScore;
|
||||
int k = ploidy;
|
||||
int x = strandIndex(! read.getReadNegativeStrandFlag() );
|
||||
|
||||
cache[m][a][i][j][k][x] = val;
|
||||
}
|
||||
|
||||
protected DiploidSNPGenotypeLikelihoods getCache( DiploidSNPGenotypeLikelihoods[][][][][][] cache,
|
||||
byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
|
||||
int m = FourBaseLikelihoodsFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal();
|
||||
int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read);
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
|
||||
int j = qualityScore;
|
||||
int k = ploidy;
|
||||
int x = strandIndex(! read.getReadNegativeStrandFlag() );
|
||||
return cache[m][a][i][j][k][x];
|
||||
}
|
||||
|
||||
// Computes a fresh per-observation likelihoods object for this base: combines the four-base
// likelihoods P(obs | chrom base) into diploid genotype likelihoods, returning a cloned,
// zeroed copy of this object holding only this observation's contribution.
// Returns null when the four-base model rejects the base (e.g. quality filtering).
protected DiploidSNPGenotypeLikelihoods calculateGenotypeLikelihoods(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
    FourBaseLikelihoods fbl = fourBaseLikelihoods.computeLog10Likelihoods(observedBase, qualityScore, read, offset);
    if ( fbl == null )
        return null;

    double[] fbLikelihoods = fbl.getLog10Likelihoods();
    try {

        // clone + zero so this object's accumulated state stays untouched
        DiploidSNPGenotypeLikelihoods gl = (DiploidSNPGenotypeLikelihoods)this.clone();
        gl.setToZero();

        // we need to adjust for ploidy.  We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space
        double ploidyAdjustment = log10(FIXED_PLOIDY);

        for ( DiploidGenotype g : DiploidGenotype.values() ) {

            // todo assumes ploidy is 2 -- should be generalized.  Obviously the below code can be turned into a loop
            // P(obs | g) = P(obs | base1)/2 + P(obs | base2)/2, mixed in linear space then taken back to log10
            double p_base = 0.0;
            p_base += pow(10, fbLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base1)] - ploidyAdjustment);
            p_base += pow(10, fbLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base2)] - ploidyAdjustment);
            double likelihood = log10(p_base);

            gl.log10Likelihoods[g.ordinal()] += likelihood;
            gl.log10Posteriors[g.ordinal()] += likelihood;
        }

        if ( VERBOSE ) {
            for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); }
            System.out.println();
            for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", gl.log10Likelihoods[g.ordinal()]); }
            System.out.println();
        }

        return gl;

    } catch ( CloneNotSupportedException e ) {
        throw new RuntimeException(e);
    }
}
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// helper routines
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public static int strandIndex(boolean fwdStrand) {
|
||||
return fwdStrand ? 0 : 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true when the observedBase is considered bad and shouldn't be processed by this object. A base
|
||||
* is considered bad if:
|
||||
*
|
||||
* Criterion 1: observed base isn't a A,C,T,G or lower case equivalent
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @return true if the base is a bad base
|
||||
*/
|
||||
protected boolean badBase(byte observedBase) {
|
||||
return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a string representation of this object in a moderately usable form
|
||||
*
|
||||
* @return string representation
|
||||
*/
|
||||
public String toString() {
|
||||
double sum = 0;
|
||||
StringBuilder s = new StringBuilder();
|
||||
for (DiploidGenotype g : DiploidGenotype.values()) {
|
||||
s.append(String.format("%s %.10f ", g, log10Likelihoods[g.ordinal()]));
|
||||
sum += Math.pow(10,log10Likelihoods[g.ordinal()]);
|
||||
}
|
||||
s.append(String.format(" %f", sum));
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// Validation routines
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public boolean validate() {
|
||||
return validate(true);
|
||||
}
|
||||
|
||||
public boolean validate(boolean throwException) {
|
||||
try {
|
||||
priors.validate(throwException);
|
||||
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
String bad = null;
|
||||
|
||||
int i = g.ordinal();
|
||||
if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) {
|
||||
bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]);
|
||||
} else if ( ! MathUtils.wellFormedDouble(log10Posteriors[i]) || ! MathUtils.isNegativeOrZero(log10Posteriors[i]) ) {
|
||||
bad = String.format("Posterior %f is badly formed", log10Posteriors[i]);
|
||||
}
|
||||
|
||||
if ( bad != null ) {
|
||||
throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
|
||||
}
|
||||
}
|
||||
} catch ( IllegalStateException e ) {
|
||||
if ( throwException )
|
||||
throw new RuntimeException(e);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// Constant static data
|
||||
//
|
||||
final static double[] zeros = new double[DiploidGenotype.values().length];
|
||||
|
||||
static {
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
zeros[g.ordinal()] = 0.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,257 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class DiploidSNPGenotypePriors implements GenotypePriors {
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Constants and static information
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
public static final double HUMAN_HETEROZYGOSITY = 1e-3;
|
||||
public static final double CEU_HETEROZYGOSITY = 1e-3;
|
||||
public static final double YRI_HETEROZYGOSITY = 1.0 / 850;
|
||||
|
||||
/**
|
||||
* Default value of the prob of seeing a reference error. Used to calculation the
|
||||
* chance of seeing a true B/C het when the reference is A, which we assume is the product
|
||||
* of the ref error rate and the het. Default value is Q60
|
||||
*/
|
||||
public static final double PROB_OF_REFERENCE_ERROR = 1e-6; // the reference is
|
||||
|
||||
private final static double[] flatPriors = new double[DiploidGenotype.values().length];
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Diploid priors
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
private double[] priors = null;
|
||||
|
||||
// todo -- fix me when this issue is resolved
|
||||
public static final boolean requirePriorSumToOne = false;
|
||||
|
||||
/**
|
||||
* Create a new DiploidGenotypePriors object with flat priors for each diploid genotype
|
||||
*/
|
||||
public DiploidSNPGenotypePriors() {
|
||||
priors = flatPriors.clone();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new GenotypeLikelihoods object with priors for a diploid with heterozygosity and reference
|
||||
* base ref
|
||||
*
|
||||
* @param ref
|
||||
* @param heterozygosity
|
||||
* @param probOfTriStateGenotype The prob of seeing a true B/C het when the reference is A
|
||||
*/
|
||||
public DiploidSNPGenotypePriors(byte ref, double heterozygosity, double probOfTriStateGenotype) {
|
||||
priors = getReferencePolarizedPriors(ref, heterozygosity, probOfTriStateGenotype);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new Genotypelike Likelhoods's object with priors (in log10 space) for each of the DiploteGenotypes
|
||||
*
|
||||
* @param log10Priors
|
||||
*/
|
||||
public DiploidSNPGenotypePriors(double[] log10Priors) {
|
||||
priors = log10Priors.clone();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||
*
|
||||
* @return log10 prior as a double array
|
||||
*/
|
||||
public double[] getPriors() {
|
||||
return priors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the prior associated with DiploidGenotype g
|
||||
* @param g
|
||||
* @return log10 prior as a double
|
||||
*/
|
||||
public double getPrior(DiploidGenotype g) {
|
||||
return getPriors()[g.ordinal()];
|
||||
}
|
||||
|
||||
public double getHeterozygosity() { return HUMAN_HETEROZYGOSITY; }
|
||||
|
||||
public boolean validate(boolean throwException) {
|
||||
try {
|
||||
if ( requirePriorSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(priors), 1.0) != 0 ) {
|
||||
throw new IllegalStateException(String.format("Priors don't sum to 1: sum=%f %s", MathUtils.sumLog10(priors), Arrays.toString(priors)));
|
||||
}
|
||||
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
int i = g.ordinal();
|
||||
if ( ! MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) {
|
||||
String bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i]));
|
||||
throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
|
||||
}
|
||||
}
|
||||
} catch ( IllegalStateException e ) {
|
||||
if ( throwException )
|
||||
throw new RuntimeException(e);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Static functionality
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns homozygous-reference, heterozygous, and homozygous-non-ref probabilities given a heterozygosity
|
||||
* value, as elements 0, 1, and 2 of a double[], respectively
|
||||
*
|
||||
* @param h the heterozygosity [probability of a base being heterozygous]
|
||||
*/
|
||||
@Deprecated
|
||||
public static double[] heterozygosity2DiploidProbabilities(double h) {
|
||||
double[] pdbls = new double[3];
|
||||
|
||||
pdbls[0] = heterozygosity2HomRefProbability(h);
|
||||
pdbls[1] = heterozygosity2HetProbability(h);
|
||||
pdbls[2] = heterozygosity2HomVarProbability(h);
|
||||
return pdbls;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param h
|
||||
* @return
|
||||
*/
|
||||
public static double heterozygosity2HomRefProbability(double h) {
|
||||
if (MathUtils.isNegative(h)) {
|
||||
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||
}
|
||||
|
||||
double v = 1.0 - (3.0 * h / 2.0);
|
||||
if (MathUtils.isNegative(v)) {
|
||||
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
public static double heterozygosity2HetProbability(double h) {
|
||||
if (MathUtils.isNegative(h)) {
|
||||
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||
}
|
||||
|
||||
return h;
|
||||
}
|
||||
|
||||
public static double heterozygosity2HomVarProbability(double h) {
|
||||
if (MathUtils.isNegative(h)) {
|
||||
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||
}
|
||||
|
||||
return h / 2.0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector
|
||||
* appropriately.
|
||||
*
|
||||
* Suppose A is the reference base, and we are given the probability of being hom-ref, het, and hom-var,
|
||||
* and that pTriSateGenotype is the true probability of observing reference A and a true genotype of B/C
|
||||
* then this sets the priors to:
|
||||
*
|
||||
* AA = hom-ref
|
||||
* AC = AG = AT = (het - pTriStateGenotype) / 3
|
||||
* CC = GG = TT = hom-var / 3
|
||||
* CG = CT = GT = pTriStateGenotype / 3
|
||||
*
|
||||
* So that we get:
|
||||
*
|
||||
* hom-ref + 3 * (het - pTriStateGenotype) / 3 + 3 * hom-var / 3 + 3 * pTriStateGenotype
|
||||
* hom-ref + het - pTriStateGenotype + hom-var + pTriStateGenotype
|
||||
* hom-ref + het + hom-var
|
||||
* = 1
|
||||
*
|
||||
* @param ref
|
||||
* @param heterozyosity
|
||||
* @param pRefError
|
||||
*/
|
||||
public static double[] getReferencePolarizedPriors(byte ref, double heterozyosity, double pRefError ) {
|
||||
if ( ! MathUtils.isBounded(pRefError, 0.0, 0.01) ) {
|
||||
throw new RuntimeException(String.format("BUG: p Reference error is out of bounds (0.0 - 0.01) is allow range %f", pRefError));
|
||||
}
|
||||
|
||||
double pTriStateGenotype = heterozyosity * pRefError;
|
||||
// if ( pTriStateGenotype >= heterozyosity ) {
|
||||
// throw new RuntimeException(String.format("p Tristate genotype %f is greater than the heterozygosity %f", pTriStateGenotype, heterozyosity));
|
||||
// }
|
||||
|
||||
double pHomRef = heterozygosity2HomRefProbability(heterozyosity);
|
||||
double pHet = heterozygosity2HetProbability(heterozyosity);
|
||||
double pHomVar = heterozygosity2HomVarProbability(heterozyosity);
|
||||
|
||||
if (MathUtils.compareDoubles(pHomRef + pHet + pHomVar, 1.0) != 0) {
|
||||
throw new RuntimeException(String.format("BUG: Prior probabilities don't sum to one => %f, %f, %f", pHomRef, pHet, pHomVar));
|
||||
}
|
||||
|
||||
double[] priors = new double[DiploidGenotype.values().length];
|
||||
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
double POfG;
|
||||
|
||||
final double nOnRefHets = 3;
|
||||
final double nOffRefHets = 3;
|
||||
final double nHomVars = 3;
|
||||
|
||||
if ( g.isHomRef(ref) ) { POfG = pHomRef; }
|
||||
else if ( g.isHomVar(ref) ) { POfG = pHomVar / nHomVars; }
|
||||
else if ( g.isHetRef(ref) ) { POfG = (pHet - pTriStateGenotype ) / nOnRefHets; }
|
||||
else { POfG = pTriStateGenotype / nOffRefHets; }
|
||||
|
||||
priors[g.ordinal()] = Math.log10(POfG);
|
||||
}
|
||||
|
||||
return priors;
|
||||
}
|
||||
|
||||
static {
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
flatPriors[g.ordinal()] = Math.log10(1.0 / DiploidGenotype.values().length);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,301 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import java.util.TreeMap;
|
||||
import java.util.EnumMap;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
|
||||
public class EmpiricalSubstitutionProbabilities extends FourBaseLikelihoods {
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Static methods to manipulate machine platforms
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
/** Sequencing technologies with distinct empirical substitution tables; UNKNOWN uses a flat 1/3 table. */
public enum SequencerPlatform {
    SOLEXA,         // Solexa / Illumina
    ROCHE454,       // 454
    SOLID,          // SOLiD
    CG,             // Complete Genomics
    UNKNOWN         // No idea -- defaulting to 1/3
}
|
||||
|
||||
// Maps read-group PL field strings to platforms.  bind() registers each string in its
// original, upper-case, and lower-case forms; lookups (standardizeSequencerPlatform)
// query with the lower-cased input.
private static TreeMap<String, SequencerPlatform> PLFieldToSequencerPlatform = new TreeMap<String, SequencerPlatform>();
private static void bind(String s, SequencerPlatform x) {
    PLFieldToSequencerPlatform.put(s, x);
    PLFieldToSequencerPlatform.put(s.toUpperCase(), x);
    PLFieldToSequencerPlatform.put(s.toLowerCase(), x);
}
|
||||
|
||||
//
|
||||
// Static list of platforms supported by this system.
|
||||
//
|
||||
static {
    // Register the PL strings emitted by common pipelines for each supported platform.
    bind("LS454", SequencerPlatform.ROCHE454);
    bind("454", SequencerPlatform.ROCHE454);
    bind("ILLUMINA", SequencerPlatform.SOLEXA);
    bind("SOLEXA", SequencerPlatform.SOLEXA);
    bind("SOLID", SequencerPlatform.SOLID);
    bind("ABI_SOLID", SequencerPlatform.SOLID);
    bind("CG", SequencerPlatform.CG);
}
|
||||
|
||||
public static SequencerPlatform standardizeSequencerPlatform( final String sequencerString ) {
|
||||
String lcSequencerString = sequencerString == null ? null : sequencerString.toLowerCase();
|
||||
if ( sequencerString != null && PLFieldToSequencerPlatform.containsKey(lcSequencerString) ) {
|
||||
return PLFieldToSequencerPlatform.get(lcSequencerString);
|
||||
} else {
|
||||
return SequencerPlatform.UNKNOWN;
|
||||
}
|
||||
}
|
||||
|
||||
// One-element memoization of the most recently seen read's platform, keyed by object
// identity (==); ThreadLocal so concurrent walkers don't clobber each other's cached entry.
private static ThreadLocal<SAMRecord> lastReadForPL = new ThreadLocal<SAMRecord>();
private static ThreadLocal<SequencerPlatform> plOfLastRead = new ThreadLocal<SequencerPlatform>();

/**
 * Returns the sequencer platform of this read, derived from its read group's PL field;
 * UNKNOWN when the read group or its platform string is absent or unrecognized.
 *
 * @param read the read to inspect
 * @return the read's platform, never null
 */
public static SequencerPlatform getReadSequencerPlatform( SAMRecord read ) {
    // identity comparison: the cache only hits for the exact same SAMRecord object,
    // which is the common case when many bases of one read are processed consecutively
    if ( lastReadForPL.get() != read ) {
        lastReadForPL.set(read);
        SAMReadGroupRecord readGroup = read.getReadGroup();
        final String platformName = readGroup == null ? null : readGroup.getPlatform();
        plOfLastRead.set(standardizeSequencerPlatform(platformName));
    }

    return plOfLastRead.get();
}
|
||||
|
||||
public int getReadSequencerPlatformIndex( SAMRecord read ) {
|
||||
return getReadSequencerPlatform(read).ordinal();
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Static methods to get at the transition tables themselves
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
 * A matrix of value i x j -> log10(p) where
 *
 *   i - index of the miscalled base (i.e., A)
 *   j - index of the presumed true base (i.e., C)
 *   log10p - empirical log10 probability p that a miscalled i is actually j
 *
 * One 4x4 table is kept per technology; tables are populated by the add* methods below
 * at class-load time.
 */
private final static EnumMap<SequencerPlatform, double[][]> log10pTrueGivenMiscall = new EnumMap<SequencerPlatform, double[][]>(SequencerPlatform.class);
|
||||
private static void addMisCall(final SequencerPlatform pl, byte miscalledBase, byte trueBase, double p) {
|
||||
if ( ! log10pTrueGivenMiscall.containsKey(pl) )
|
||||
log10pTrueGivenMiscall.put(pl, new double[4][4]);
|
||||
|
||||
double[][] misCallProbs = log10pTrueGivenMiscall.get(pl);
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase);
|
||||
int j = BaseUtils.simpleBaseToBaseIndex(trueBase);
|
||||
misCallProbs[i][j] = log10(p);
|
||||
}
|
||||
|
||||
// Looks up log10 P(trueBase | miscalled as miscalledBase) for the platform.
// Entries never written by addMisCall() remain 0.0 (the array default), which is used here
// as the "unset" sentinel.  NOTE(review): a legitimate probability of exactly 1.0 would also
// yield log10 == 0.0 and be rejected; none of the tables above contain such a value.
private static double getProbMiscallIsBase(SequencerPlatform pl, byte miscalledBase, byte trueBase) {
    int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase);
    int j = BaseUtils.simpleBaseToBaseIndex(trueBase);

    double logP = log10pTrueGivenMiscall.get(pl)[i][j];
    if ( logP == 0.0 )
        throw new RuntimeException(String.format("Bad miscall base request miscalled=%c true=%c", miscalledBase, trueBase));
    else
        return logP;
}
|
||||
|
||||
// Empirical Solexa/Illumina confusion table: P(true base | miscalled base).
// Each group of three entries (one miscalled base against the three other bases) sums to ~1.
// Source of the percentages is not shown in this file -- presumably internal error-mode
// studies; TODO confirm provenance.
private static void addSolexa() {
    SequencerPlatform pl = SequencerPlatform.SOLEXA;
    addMisCall(pl, BaseUtils.A, BaseUtils.C, 57.7/100.0);
    addMisCall(pl, BaseUtils.A, BaseUtils.G, 17.1/100.0);
    addMisCall(pl, BaseUtils.A, BaseUtils.T, 25.2/100.0);

    addMisCall(pl, BaseUtils.C, BaseUtils.A, 34.9/100.0);
    addMisCall(pl, BaseUtils.C, BaseUtils.G, 11.3/100.0);
    addMisCall(pl, BaseUtils.C, BaseUtils.T, 53.9/100.0);

    addMisCall(pl, BaseUtils.G, BaseUtils.A, 31.9/100.0);
    addMisCall(pl, BaseUtils.G, BaseUtils.C,  5.1/100.0);
    addMisCall(pl, BaseUtils.G, BaseUtils.T, 63.0/100.0);

    addMisCall(pl, BaseUtils.T, BaseUtils.A, 45.8/100.0);
    addMisCall(pl, BaseUtils.T, BaseUtils.C, 22.1/100.0);
    addMisCall(pl, BaseUtils.T, BaseUtils.G, 32.0/100.0);
}
|
||||
|
||||
// Empirical SOLiD confusion table: P(true base | miscalled base); see addSolexa() for format.
private static void addSOLiD() {
    SequencerPlatform pl = SequencerPlatform.SOLID;
    addMisCall(pl, BaseUtils.A, BaseUtils.C, 18.7/100.0);
    addMisCall(pl, BaseUtils.A, BaseUtils.G, 42.5/100.0);
    addMisCall(pl, BaseUtils.A, BaseUtils.T, 38.7/100.0);

    addMisCall(pl, BaseUtils.C, BaseUtils.A, 27.0/100.0);
    addMisCall(pl, BaseUtils.C, BaseUtils.G, 18.9/100.0);
    addMisCall(pl, BaseUtils.C, BaseUtils.T, 54.1/100.0);

    addMisCall(pl, BaseUtils.G, BaseUtils.A, 61.0/100.0);
    addMisCall(pl, BaseUtils.G, BaseUtils.C, 15.7/100.0);
    addMisCall(pl, BaseUtils.G, BaseUtils.T, 23.2/100.0);

    addMisCall(pl, BaseUtils.T, BaseUtils.A, 40.5/100.0);
    addMisCall(pl, BaseUtils.T, BaseUtils.C, 34.3/100.0);
    addMisCall(pl, BaseUtils.T, BaseUtils.G, 25.2/100.0);
}
|
||||
|
||||
// Empirical 454 confusion table: P(true base | miscalled base); see addSolexa() for format.
private static void add454() {
    SequencerPlatform pl = SequencerPlatform.ROCHE454;
    addMisCall(pl, BaseUtils.A, BaseUtils.C, 23.2/100.0);
    addMisCall(pl, BaseUtils.A, BaseUtils.G, 42.6/100.0);
    addMisCall(pl, BaseUtils.A, BaseUtils.T, 34.3/100.0);

    addMisCall(pl, BaseUtils.C, BaseUtils.A, 19.7/100.0);
    addMisCall(pl, BaseUtils.C, BaseUtils.G,  8.4/100.0);
    addMisCall(pl, BaseUtils.C, BaseUtils.T, 71.9/100.0);

    addMisCall(pl, BaseUtils.G, BaseUtils.A, 71.5/100.0);
    addMisCall(pl, BaseUtils.G, BaseUtils.C,  6.6/100.0);
    addMisCall(pl, BaseUtils.G, BaseUtils.T, 21.9/100.0);

    addMisCall(pl, BaseUtils.T, BaseUtils.A, 43.8/100.0);
    addMisCall(pl, BaseUtils.T, BaseUtils.C, 37.8/100.0);
    addMisCall(pl, BaseUtils.T, BaseUtils.G, 18.5/100.0);
}
|
||||
|
||||
// Empirical Complete Genomics confusion table: P(true base | miscalled base); see addSolexa().
private static void addCG() {
    SequencerPlatform pl = SequencerPlatform.CG;
    addMisCall(pl, BaseUtils.A, BaseUtils.C, 28.2/100.0);
    addMisCall(pl, BaseUtils.A, BaseUtils.G, 28.7/100.0);
    addMisCall(pl, BaseUtils.A, BaseUtils.T, 43.1/100.0);

    addMisCall(pl, BaseUtils.C, BaseUtils.A, 29.8/100.0);
    addMisCall(pl, BaseUtils.C, BaseUtils.G, 18.6/100.0);
    addMisCall(pl, BaseUtils.C, BaseUtils.T, 51.6/100.0);

    addMisCall(pl, BaseUtils.G, BaseUtils.A, 32.5/100.0);
    addMisCall(pl, BaseUtils.G, BaseUtils.C, 21.4/100.0);
    addMisCall(pl, BaseUtils.G, BaseUtils.T, 46.1/100.0);

    addMisCall(pl, BaseUtils.T, BaseUtils.A, 42.6/100.0);
    addMisCall(pl, BaseUtils.T, BaseUtils.C, 34.1/100.0);
    addMisCall(pl, BaseUtils.T, BaseUtils.G, 23.3/100.0);
}
|
||||
|
||||
private static void addUnknown() {
|
||||
SequencerPlatform pl = SequencerPlatform.UNKNOWN;
|
||||
for ( byte b1 : BaseUtils.BASES ) {
|
||||
for ( byte b2 : BaseUtils.BASES ) {
|
||||
if ( b1 != b2 )
|
||||
addMisCall(pl, b1, b2, 1.0/3.0);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
    // Populate the static per-platform miscall tables exactly once, at
    // class-load time, before any instance can query them.
    static {
        addSolexa();
        add454();
        addSOLiD();
        addCG();
        addUnknown();
    }
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// The actual objects themselves
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
    // When true, a read whose platform cannot be determined causes
    // log10PofTrueBaseGivenMiscall() to throw; when false, defaultPlatform
    // is substituted instead.
    private boolean raiseErrorOnUnknownPlatform = true;
    // Platform assumed for unknown reads when raiseErrorOnUnknownPlatform is false.
    private SequencerPlatform defaultPlatform = SequencerPlatform.UNKNOWN;
|
||||
|
||||
//
|
||||
// forwarding constructors -- don't do anything at all
|
||||
//
|
||||
    /** Default constructor: reads from an unknown platform are treated as an error. */
    public EmpiricalSubstitutionProbabilities() { super(); }
|
||||
|
||||
    /**
     * Constructor that controls unknown-platform handling.
     *
     * @param raiseErrorOnUnknownPlatform if true, unknown platforms raise a
     *        RuntimeException; if false, the UNKNOWN flat error model is used
     */
    public EmpiricalSubstitutionProbabilities(boolean raiseErrorOnUnknownPlatform) {
        super();
        this.raiseErrorOnUnknownPlatform = raiseErrorOnUnknownPlatform;
    }
|
||||
|
||||
    /**
     * Constructor that maps unrecognized platforms onto a caller-chosen default.
     *
     * @param assumeUnknownPlatformsAreThis platform to substitute for unknown
     *        reads; if null, behavior is identical to the default constructor
     *        (unknown platforms raise an error)
     */
    public EmpiricalSubstitutionProbabilities(SequencerPlatform assumeUnknownPlatformsAreThis) {
        super();

        if ( assumeUnknownPlatformsAreThis != null ) {
            raiseErrorOnUnknownPlatform = false;
            defaultPlatform = assumeUnknownPlatformsAreThis;
        }
    }
|
||||
|
||||
    /**
     * Cloning of the object; delegates to the superclass, which deep-copies
     * the likelihood array.
     * @return clone
     * @throws CloneNotSupportedException
     */
    protected Object clone() throws CloneNotSupportedException {
        return super.clone();
    }
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// calculation of p(B|GT)
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
|
||||
boolean fwdStrand = ! read.getReadNegativeStrandFlag();
|
||||
SequencerPlatform pl = getReadSequencerPlatform(read);
|
||||
|
||||
if ( pl == SequencerPlatform.UNKNOWN ) {
|
||||
if ( raiseErrorOnUnknownPlatform )
|
||||
throw new RuntimeException("Unknown sequencer platform for read " + read.format() + "; your BAM file is either missing the PL tag for some read groups or an unsupported platform is being used.");
|
||||
else {
|
||||
pl = defaultPlatform;
|
||||
//System.out.printf("Using default platform %s", pl);
|
||||
}
|
||||
}
|
||||
|
||||
//System.out.printf("%s for %s%n", pl, read);
|
||||
|
||||
double log10p;
|
||||
if ( fwdStrand ) {
|
||||
log10p = getProbMiscallIsBase(pl, observedBase, chromBase);
|
||||
} else {
|
||||
log10p = getProbMiscallIsBase(pl, BaseUtils.simpleComplement(observedBase), BaseUtils.simpleComplement(chromBase));
|
||||
}
|
||||
|
||||
//System.out.printf("p = %f for %s %c %c fwd=%b %d at %s%n", pow(10,log10p), pl, observedBase, chromBase, fwdStrand, offset, read.getReadName() );
|
||||
|
||||
return log10p;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,50 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
 * Exact allele-frequency calculation model.
 *
 * NOTE(review): this is an unimplemented stub. getLog10PNonRef() does
 * nothing, so selecting this model leaves log10AlleleFrequencyPosteriors
 * untouched (whatever the caller initialized it to). Use the grid-search
 * model until this is implemented.
 */
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {

    protected ExactAFCalculationModel(int N, Logger logger, PrintStream verboseWriter) {
        super(N, logger, verboseWriter);
    }

    /**
     * Intended to fill log10AlleleFrequencyPosteriors with the exact
     * P(AF > 0) computation; currently a no-op.
     *
     * @param tracker rod data
     * @param ref reference context
     * @param GLs per-sample biallelic genotype likelihoods
     * @param log10AlleleFrequencyPriors priors per allele frequency
     * @param log10AlleleFrequencyPosteriors output array -- NOT written by this stub
     */
    public void getLog10PNonRef(RefMetaDataTracker tracker,
                                ReferenceContext ref,
                                Map<String, BiallelicGenotypeLikelihoods> GLs,
                                double[] log10AlleleFrequencyPriors,
                                double[] log10AlleleFrequencyPosteriors) {

        // TODO: implement me based on
    }
}
|
||||
|
|
@ -0,0 +1,373 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
|
||||
/**
|
||||
* Stable, error checking version of strict 4 base likelihoods. Useful for calculating the likelihoods, priors,
|
||||
* and posteriors given a pile of bases and quality scores (in conjuncion with GenotypeLikelihoods)
|
||||
*
|
||||
* Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object
|
||||
* calculates:
|
||||
*
|
||||
* P(b | D) = P(b) * P(D | b)
|
||||
*
|
||||
* where
|
||||
*
|
||||
* P(D | b) = sum_i log10 P(bi | b)
|
||||
*
|
||||
* and
|
||||
*
|
||||
* P(bi | b) = 1 - P(error | q1) if bi = b
|
||||
* = P(error | q1) / 3 if bi != b
|
||||
*
|
||||
*
|
||||
*/
|
||||
public abstract class FourBaseLikelihoods implements Cloneable {
|
||||
|
||||
protected boolean enableCacheFlag = true;
|
||||
|
||||
//
|
||||
// The fundamental data array associated with 4-base likelihoods
|
||||
//
|
||||
protected double[] log10Likelihoods = null;
|
||||
|
||||
|
||||
/**
|
||||
* If true, lots of output will be generated about the Likelihoods at each site
|
||||
*/
|
||||
private boolean verbose = false;
|
||||
|
||||
/**
|
||||
* Bases with Q scores below this threshold aren't included in the Likelihood calculation
|
||||
*/
|
||||
private int minQScoreToInclude = 0;
|
||||
|
||||
/**
|
||||
* Create a new FourBaseLikelihoods object
|
||||
*/
|
||||
public FourBaseLikelihoods() {
|
||||
log10Likelihoods = zeros.clone(); // Likelihoods are all zeros
|
||||
}
|
||||
|
||||
/**
|
||||
* Cloning of the object
|
||||
* @return clone
|
||||
* @throws CloneNotSupportedException
|
||||
*/
|
||||
protected Object clone() throws CloneNotSupportedException {
|
||||
FourBaseLikelihoods c = (FourBaseLikelihoods)super.clone();
|
||||
c.log10Likelihoods = log10Likelihoods.clone();
|
||||
return c;
|
||||
}
|
||||
|
||||
public void setVerbose(boolean v) {
|
||||
verbose = v;
|
||||
}
|
||||
|
||||
public boolean isVerbose() {
|
||||
return verbose;
|
||||
}
|
||||
|
||||
public int getMinQScoreToInclude() {
|
||||
return minQScoreToInclude;
|
||||
}
|
||||
|
||||
public void setMinQScoreToInclude(int minQScoreToInclude) {
|
||||
this.minQScoreToInclude = minQScoreToInclude;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of log10 likelihoods for each base, indexed by BaseUtils.BASES.ordinal values()
|
||||
* @return probs
|
||||
*/
|
||||
public double[] getLog10Likelihoods() {
|
||||
return log10Likelihoods;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the likelihood associated with a base
|
||||
* @param base base
|
||||
* @return log10 likelihood as a double
|
||||
*/
|
||||
public double getLog10Likelihood(byte base) {
|
||||
return getLog10Likelihood(BaseUtils.simpleBaseToBaseIndex(base));
|
||||
}
|
||||
|
||||
public double getLog10Likelihood(int baseIndex) {
|
||||
return (baseIndex < 0 ? 0.0 : getLog10Likelihoods()[baseIndex]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of likelihoods for each base, indexed by BaseUtils.BASES.ordinal values()
|
||||
* @return probs
|
||||
*/
|
||||
public double[] getLikelihoods() {
|
||||
double[] probs = new double[4];
|
||||
for (int i = 0; i < 4; i++)
|
||||
probs[i] = Math.pow(10, log10Likelihoods[i]);
|
||||
return probs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the likelihoods associated with a base
|
||||
* @param base base
|
||||
* @return likelihoods as a double
|
||||
*/
|
||||
public double getLikelihood(byte base) {
|
||||
return getLikelihood(BaseUtils.simpleBaseToBaseIndex(base));
|
||||
}
|
||||
|
||||
public double getLikelihood(int baseIndex) {
|
||||
return (baseIndex < 0 ? 0.0 : Math.pow(10, log10Likelihoods[baseIndex]));
|
||||
}
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// add() -- the heart of
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Updates likelihoods and posteriors to reflect an additional observation of observedBase with
|
||||
* qualityScore.
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @param qualityScore base quality
|
||||
* @param read SAM read
|
||||
* @param offset offset on read
|
||||
* @return 1 if the base was considered good enough to add to the likelihoods (not Q0 or 'N', for example)
|
||||
*/
|
||||
public int add(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
|
||||
FourBaseLikelihoods fbl = computeLog10Likelihoods(observedBase, qualityScore, read, offset);
|
||||
if ( fbl == null )
|
||||
return 0;
|
||||
|
||||
for ( byte base : BaseUtils.BASES ) {
|
||||
double likelihood = fbl.getLikelihood(base);
|
||||
log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)] += likelihood;
|
||||
}
|
||||
|
||||
if ( verbose ) {
|
||||
for ( byte base : BaseUtils.BASES ) { System.out.printf("%s\t", (char)base); }
|
||||
System.out.println();
|
||||
for ( byte base : BaseUtils.BASES ) { System.out.printf("%.2f\t", log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)]); }
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates likelihoods and posteriors to reflect an additional observation of observedBase with
|
||||
* qualityScore.
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @param qualityScore base quality
|
||||
* @param read SAM read
|
||||
* @param offset offset on read
|
||||
* @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example)
|
||||
*/
|
||||
public FourBaseLikelihoods computeLog10Likelihoods(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
|
||||
if ( badBase(observedBase) ) {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
if ( qualityScore > getMinQScoreToInclude() ) {
|
||||
|
||||
FourBaseLikelihoods fbl = (FourBaseLikelihoods)this.clone();
|
||||
fbl.log10Likelihoods = zeros.clone();
|
||||
|
||||
for ( byte base : BaseUtils.BASES ) {
|
||||
double likelihood = log10PofObservingBaseGivenChromosome(observedBase, base, qualityScore, read, offset);
|
||||
|
||||
if ( verbose ) {
|
||||
boolean fwdStrand = ! read.getReadNegativeStrandFlag();
|
||||
System.out.printf(" L(%c | b=%s, Q=%d, S=%s) = %f / %f%n",
|
||||
observedBase, base, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood);
|
||||
}
|
||||
|
||||
fbl.log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)] = likelihood;
|
||||
}
|
||||
|
||||
return fbl;
|
||||
}
|
||||
} catch ( CloneNotSupportedException e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// helper routines
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns true when the observedBase is considered bad and shouldn't be processed by this object. A base
|
||||
* is considered bad if:
|
||||
*
|
||||
* Criterion 1: observed base isn't a A,C,T,G or lower case equivalent
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @return true if the base is a bad base
|
||||
*/
|
||||
private boolean badBase(byte observedBase) {
|
||||
return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a string representation of this object in a moderately usable form
|
||||
*
|
||||
* @return string representation
|
||||
*/
|
||||
public String toString() {
|
||||
double sum = 0;
|
||||
StringBuilder s = new StringBuilder();
|
||||
for ( byte base : BaseUtils.BASES ) {
|
||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(base);
|
||||
s.append(String.format("%s %.10f ", base, log10Likelihoods[baseIndex]));
|
||||
sum += Math.pow(10, log10Likelihoods[baseIndex]);
|
||||
}
|
||||
s.append(String.format(" %f", sum));
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
// in general, we don't care about the platform index; EmpiricalSubstitutionProbabilities overloads this
|
||||
public int getReadSequencerPlatformIndex( SAMRecord read ) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// Validation routines
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public boolean validate() {
|
||||
return validate(true);
|
||||
}
|
||||
|
||||
public boolean validate(boolean throwException) {
|
||||
try {
|
||||
|
||||
for ( byte base : BaseUtils.BASES ) {
|
||||
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(base);
|
||||
if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) {
|
||||
String bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]);
|
||||
throw new IllegalStateException(String.format("At %s: %s", base, bad));
|
||||
}
|
||||
}
|
||||
} catch ( IllegalStateException e ) {
|
||||
if ( throwException )
|
||||
throw new RuntimeException(e);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// Hearty math calculations follow
|
||||
//
|
||||
// -- these should not be messed with unless you know what you are doing
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @param chromBase target base
|
||||
* @param qual base quality
|
||||
* @param read SAM read
|
||||
* @param offset offset on read
|
||||
* @return log10 likelihood
|
||||
*/
|
||||
|
||||
protected double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual, SAMRecord read, int offset) {
|
||||
if (qual == 0) { // zero quals are wrong
|
||||
throw new RuntimeException(String.format("Unexpected Q0 base discovered in log10PofObservingBaseGivenChromosome: %c %s %d at %d in %s",
|
||||
observedBase, chromBase, qual, offset, read));
|
||||
}
|
||||
|
||||
double logP;
|
||||
|
||||
if ( observedBase == chromBase ) {
|
||||
// the base is consistent with the chromosome -- it's 1 - e
|
||||
//logP = oneMinusData[qual];
|
||||
double e = pow(10, (qual / -10.0));
|
||||
logP = log10(1.0 - e);
|
||||
} else {
|
||||
// the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error)
|
||||
logP = qual / -10.0 + log10PofTrueBaseGivenMiscall(observedBase, chromBase, read, offset);
|
||||
}
|
||||
|
||||
//System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP);
|
||||
return logP;
|
||||
}
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @param chromBase target base
|
||||
* @param read SAM read
|
||||
* @param offset offset on read
|
||||
* @return log10 likelihood
|
||||
*/
|
||||
protected abstract double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset);
|
||||
|
||||
//
|
||||
// Constant static data
|
||||
//
|
||||
private final static double[] zeros = new double[BaseUtils.BASES.length];
|
||||
|
||||
static {
|
||||
for ( byte base : BaseUtils.BASES ) {
|
||||
zeros[BaseUtils.simpleBaseToBaseIndex(base)] = 0.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,64 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import static org.broadinstitute.sting.playground.gatk.walkers.genotyper.BaseMismatchModel.*;
|
||||
|
||||
public class FourBaseLikelihoodsFactory {
|
||||
//private FourBaseProbabilitiesFactory() {} // cannot be instantiated
|
||||
|
||||
public static BaseMismatchModel getBaseMismatchModel(final String name) {
|
||||
return valueOf(name);
|
||||
}
|
||||
|
||||
public static BaseMismatchModel getBaseMismatchModel(final FourBaseLikelihoods m) {
|
||||
if ( m instanceof OneStateErrorProbabilities)
|
||||
return ONE_STATE;
|
||||
else if ( m instanceof ThreeStateErrorProbabilities)
|
||||
return THREE_STATE;
|
||||
else if ( m instanceof EmpiricalSubstitutionProbabilities)
|
||||
return EMPIRICAL;
|
||||
|
||||
throw new RuntimeException("Unexpected BaseMismatchModel " + m.getClass());
|
||||
|
||||
}
|
||||
/**
|
||||
* General and correct way to create FourBaseLikelihood objects for arbitrary base mismatching models
|
||||
*
|
||||
* @param m model
|
||||
* @param pl default platform
|
||||
* @return new 4-base model
|
||||
*/
|
||||
public static FourBaseLikelihoods makeFourBaseLikelihoods(BaseMismatchModel m,
|
||||
EmpiricalSubstitutionProbabilities.SequencerPlatform pl ) {
|
||||
switch ( m ) {
|
||||
case ONE_STATE: return new OneStateErrorProbabilities();
|
||||
case THREE_STATE: return new ThreeStateErrorProbabilities();
|
||||
case EMPIRICAL: return new EmpiricalSubstitutionProbabilities(pl);
|
||||
default: throw new RuntimeException("Unexpected BaseMismatchModel " + m);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* The model representing how we calculate genotype likelihoods
|
||||
*/
|
||||
/**
 * The model representing how we calculate genotype likelihoods
 */
public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {

    /** The variation types a concrete likelihood model may handle. */
    public enum Model {
        SNP,
        DINDEL
    }

    // Defensive clone of the caller's argument collection (see constructor).
    protected UnifiedArgumentCollection UAC;
    protected Logger logger;

    /**
     * Create a new object
     * @param logger logger
     * @param UAC unified arg collection; cloned so later caller-side mutation
     *            cannot affect this model
     */
    protected GenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
        this.UAC = UAC.clone();
        this.logger = logger;
    }

    /**
     * Must be overridden by concrete subclasses
     * @param tracker rod data
     * @param ref reference context
     * @param contexts stratified alignment contexts
     * @param contextType stratified context type
     * @param priors priors to use for GLs
     * @param GLs hash of sample->GL to fill in
     *
     * @return genotype likelihoods per sample for AA, AB, BB
     */
    public abstract Allele getLikelihoods(RefMetaDataTracker tracker,
                                          ReferenceContext ref,
                                          Map<String, StratifiedAlignmentContext> contexts,
                                          StratifiedAlignmentContext.StratifiedContextType contextType,
                                          GenotypePriors priors,
                                          Map<String, BiallelicGenotypeLikelihoods> GLs);
}
|
||||
|
|
@ -0,0 +1,35 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
/**
 * Contract for providers of genotype prior probabilities.
 */
public interface GenotypePriors {

    // Returns the genotype priors; presumably indexed by genotype ordinal --
    // TODO confirm indexing scheme against implementations.
    public double[] getPriors();

    // Heterozygosity parameter underlying the priors.
    public double getHeterozygosity();

    // Checks internal consistency; if throwException is true a bad state
    // raises instead of returning false.
    public boolean validate(boolean throwException);
}
|
||||
|
|
@ -0,0 +1,97 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.PrintStream;
|
||||
|
||||
public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel {
|
||||
|
||||
// for use in optimizing the P(D|AF) calculations:
|
||||
// how much off from the max likelihoods do we need to be before we can quit calculating?
|
||||
protected static final double LOG10_OPTIMIZATION_EPSILON = 8.0;
|
||||
|
||||
protected GridSearchAFEstimation(int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(N, logger, verboseWriter);
|
||||
}
|
||||
|
||||
public void getLog10PNonRef(RefMetaDataTracker tracker,
|
||||
ReferenceContext ref,
|
||||
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyPosteriors) {
|
||||
|
||||
initializeAFMatrix(GLs);
|
||||
|
||||
// first, calculate for AF=0 (no change to matrix)
|
||||
log10AlleleFrequencyPosteriors[0] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[0];
|
||||
double maxLikelihoodSeen = log10AlleleFrequencyPosteriors[0];
|
||||
|
||||
// TODO: get rid of this optimization, it is wrong!
|
||||
int minAlleleFrequencyToTest = getMinAlleleFrequencyToTest();
|
||||
|
||||
int maxAlleleFrequencyToTest = AFMatrix.getSamples().size() * 2;
|
||||
// for each minor allele frequency, calculate log10PofDgivenAFi
|
||||
for (int i = 1; i <= maxAlleleFrequencyToTest; i++) {
|
||||
// add one more alternate allele
|
||||
AFMatrix.incrementFrequency();
|
||||
|
||||
// calculate new likelihoods
|
||||
log10AlleleFrequencyPosteriors[i] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[i];
|
||||
|
||||
// an optimization to speed up the calculation: if we are beyond the local maximum such
|
||||
// that subsequent likelihoods won't factor into the confidence score, just quit
|
||||
if ( i >= minAlleleFrequencyToTest && maxLikelihoodSeen - log10AlleleFrequencyPosteriors[i] > LOG10_OPTIMIZATION_EPSILON ) {
|
||||
UnifiedGenotyperEngine.ignoreAlleleFrequenciesAboveI(log10AlleleFrequencyPosteriors, i);
|
||||
return;
|
||||
}
|
||||
|
||||
if ( log10AlleleFrequencyPosteriors[i] > maxLikelihoodSeen )
|
||||
maxLikelihoodSeen = log10AlleleFrequencyPosteriors[i];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Overrides the super class
|
||||
* @param contexts alignment contexts
|
||||
* @param GLs genotype likelihoods
|
||||
* @param log10AlleleFrequencyPosteriors allele frequency results
|
||||
*
|
||||
* @return calls
|
||||
*/
|
||||
public Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
|
||||
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
||||
double[] log10AlleleFrequencyPosteriors,
|
||||
int AFofMaxLikelihood) {
|
||||
return generateCalls(contexts, GLs, AFofMaxLikelihood);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,60 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* This implements the old style CRD calculation of the chance that a base being a true chromBase given
|
||||
* an miscalled base, in which the p is e, grabbing all of the probability. It shouldn't be used
|
||||
*/
|
||||
public class OneStateErrorProbabilities extends FourBaseLikelihoods {
|
||||
//
|
||||
// forwarding constructors -- don't do anything at all
|
||||
//
|
||||
public OneStateErrorProbabilities() { super(); }
|
||||
|
||||
/**
|
||||
* Cloning of the object
|
||||
* @return clone
|
||||
* @throws CloneNotSupportedException
|
||||
*/
|
||||
protected Object clone() throws CloneNotSupportedException {
|
||||
return super.clone();
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @param chromBase target base
|
||||
* @param read SAM read
|
||||
* @param offset offset on read
|
||||
* @return log10 likelihood
|
||||
*/
|
||||
protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
|
||||
return 0; // equivalent to e model
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,138 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Genotype-likelihoods calculation model for SNPs: picks a single best
 * alternate allele per site (the non-reference base with the largest summed
 * base quality across all samples) and fills in biallelic ref/het/hom
 * likelihoods and posteriors for every sample.
 */
public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {

    // the alternate allele with the largest sum of quality scores
    // NOTE(review): this is instance state, recomputed only when contextType is
    // COMPLETE; calls with other context types appear to reuse the value from a
    // prior COMPLETE pass at the same locus — confirm callers always do the
    // COMPLETE pass first.
    protected Byte bestAlternateAllele = null;

    /**
     * @param UAC    the unified argument collection (model configuration)
     * @param logger logger forwarded to the superclass
     */
    protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
        super(UAC, logger);
    }

    /**
     * Computes per-sample biallelic genotype likelihoods at this locus and
     * populates the supplied GLs map.
     *
     * @param tracker     the reference metadata tracker (used to check the trigger track)
     * @param ref         the reference context at this locus
     * @param contexts    per-sample stratified alignment contexts
     * @param contextType which stratification (COMPLETE/FORWARD/REVERSE) to use
     * @param priors      genotype priors; must be DiploidSNPGenotypePriors
     * @param GLs         output map, filled with one BiallelicGenotypeLikelihoods per sample
     * @return the reference allele at this locus (returned even when no likelihoods
     *         are computed; callers can detect the no-variant case via an empty GLs map)
     * @throws StingException if the priors are not diploid SNP priors
     */
    public Allele getLikelihoods(RefMetaDataTracker tracker,
                                 ReferenceContext ref,
                                 Map<String, StratifiedAlignmentContext> contexts,
                                 StratifiedAlignmentContext.StratifiedContextType contextType,
                                 GenotypePriors priors,
                                 Map<String, BiallelicGenotypeLikelihoods> GLs) {

        if ( !(priors instanceof DiploidSNPGenotypePriors) )
            throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model");

        byte refBase = ref.getBase();
        Allele refAllele = Allele.create(refBase, true);

        // find the alternate allele with the largest sum of quality scores
        if ( contextType == StratifiedAlignmentContext.StratifiedContextType.COMPLETE )
            initializeBestAlternateAllele(refBase, contexts);

        // if there are no non-ref bases...
        if ( bestAlternateAllele == null ) {
            // did we trigger on the provided track?
            boolean atTriggerTrack = tracker.getReferenceMetaData(UnifiedGenotyperEngine.TRIGGER_TRACK_NAME, false).size() > 0;

            // if we don't want all bases, then we don't need to calculate genotype likelihoods
            if ( !atTriggerTrack && !UAC.ALL_BASES_MODE && !UAC.GENOTYPE_MODE )
                return refAllele;

            // otherwise, choose any alternate allele (it doesn't really matter)
            bestAlternateAllele = (byte)(refBase != 'A' ? 'A' : 'C');
        }

        Allele altAllele = Allele.create(bestAlternateAllele, false);

        // compute ref-hom / het / alt-hom likelihoods and posteriors per sample
        for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
            ReadBackedPileup pileup = sample.getValue().getContext(contextType).getBasePileup();

            // create the GenotypeLikelihoods object
            DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.baseModel, (DiploidSNPGenotypePriors)priors, UAC.defaultPlatform);
            GL.add(pileup, true, UAC.CAP_BASE_QUALITY);
            double[] likelihoods = GL.getLikelihoods();
            double[] posteriors = GL.getPosteriors();

            // project the full 10-genotype diploid arrays down to the 3
            // biallelic genotypes (ref/ref, ref/alt, alt/alt)
            DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(refBase);
            DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(refBase, bestAlternateAllele);
            DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(bestAlternateAllele);
            GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),
                    refAllele,
                    altAllele,
                    likelihoods[refGenotype.ordinal()],
                    likelihoods[hetGenotype.ordinal()],
                    likelihoods[homGenotype.ordinal()],
                    posteriors[refGenotype.ordinal()],
                    posteriors[hetGenotype.ordinal()],
                    posteriors[homGenotype.ordinal()]));
        }

        return refAllele;
    }

    /**
     * Sets {@link #bestAlternateAllele} to the non-reference base with the
     * largest sum of base quality scores across all samples' COMPLETE pileups,
     * or null when no non-reference base is observed (deletions and
     * non-regular bases are ignored).
     *
     * @param ref      the reference base at this locus
     * @param contexts per-sample stratified alignment contexts
     */
    protected void initializeBestAlternateAllele(byte ref, Map<String, StratifiedAlignmentContext> contexts) {
        // summed base qualities indexed by base (A/C/G/T)
        int[] qualCounts = new int[4];

        for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
            // calculate the sum of quality scores for each base
            ReadBackedPileup pileup = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup();
            for ( PileupElement p : pileup ) {
                // ignore deletions
                if ( p.isDeletion() )
                    continue;

                // index < 0 means a non-regular base (e.g. N); skip it
                int index = BaseUtils.simpleBaseToBaseIndex(p.getBase());
                if ( index >= 0 )
                    qualCounts[index] += p.getQual();
            }
        }

        // set the non-ref base with maximum quality score sum
        int maxCount = 0;
        bestAlternateAllele = null;
        for ( byte altAllele : BaseUtils.BASES ) {
            if ( altAllele == ref )
                continue;
            int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
            if ( qualCounts[index] > maxCount ) {
                maxCount = qualCounts[index];
                bestAlternateAllele = altAllele;
            }
        }
    }
}
|
||||
|
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
public class ThreeStateErrorProbabilities extends FourBaseLikelihoods {
|
||||
//
|
||||
// forwarding constructors -- don't do anything at all
|
||||
//
|
||||
public ThreeStateErrorProbabilities() { super(); }
|
||||
|
||||
/**
|
||||
* Cloning of the object
|
||||
* @return clone
|
||||
* @throws CloneNotSupportedException
|
||||
*/
|
||||
protected Object clone() throws CloneNotSupportedException {
|
||||
return super.clone();
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple log10(3) cached value
|
||||
*/
|
||||
protected static final double log103 = log10(3.0);
|
||||
|
||||
/**
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @param chromBase target base
|
||||
* @param read SAM read
|
||||
* @param offset offset on read
|
||||
* @return log10 likelihood
|
||||
*/
|
||||
protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
|
||||
return -log103; // equivalent to e / 3 model
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,125 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
|
||||
|
||||
public class UnifiedArgumentCollection {
|
||||
|
||||
// control the various models to be used
|
||||
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while DINDEL is also available for calling indels.", required = false)
|
||||
public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
|
||||
|
||||
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false)
|
||||
public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
|
||||
|
||||
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ when using the SNP Genotype Likelihoods model -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
|
||||
public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL;
|
||||
|
||||
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
|
||||
public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY;
|
||||
|
||||
// control the output
|
||||
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false)
|
||||
public boolean GENOTYPE_MODE = false;
|
||||
|
||||
@Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output all callable bases?", required = false)
|
||||
public boolean ALL_BASES_MODE = false;
|
||||
|
||||
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false)
|
||||
public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
|
||||
|
||||
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
|
||||
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
|
||||
|
||||
@Argument(fullName = "trigger_min_confidence_threshold_for_calling", shortName = "trig_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be called", required = false)
|
||||
public double TRIGGER_CONFIDENCE_FOR_CALLING = 30.0;
|
||||
|
||||
@Argument(fullName = "trigger_min_confidence_threshold_for_emitting", shortName = "trig_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
|
||||
public double TRIGGER_CONFIDENCE_FOR_EMITTING = 30.0;
|
||||
|
||||
@Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false)
|
||||
public boolean NO_SLOD = false;
|
||||
|
||||
|
||||
// control the error modes
|
||||
@Hidden
|
||||
@Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
|
||||
public String ASSUME_SINGLE_SAMPLE = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
|
||||
public EmpiricalSubstitutionProbabilities.SequencerPlatform defaultPlatform = null;
|
||||
|
||||
|
||||
// control the various parameters to be used
|
||||
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
|
||||
public int MIN_BASE_QUALTY_SCORE = 10;
|
||||
|
||||
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false)
|
||||
public int MIN_MAPPING_QUALTY_SCORE = 10;
|
||||
|
||||
@Argument(fullName = "max_mismatches_in_40bp_window", shortName = "mm40", doc = "Maximum number of mismatches within a 40 bp window (20bp on either side) around the target position for a read to be used for calling", required = false)
|
||||
public int MAX_MISMATCHES = 3;
|
||||
|
||||
@Argument(fullName = "use_reads_with_bad_mates", shortName = "bad_mates", doc = "Use reads whose mates are mapped excessively far away for calling", required = false)
|
||||
public boolean USE_BADLY_MATED_READS = false;
|
||||
|
||||
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
||||
public Double MAX_DELETION_FRACTION = 0.05;
|
||||
|
||||
@Argument(fullName = "cap_base_quality_by_mapping_quality", shortName = "cap_base_qual", doc = "Cap the base quality of any given base by its read's mapping quality", required = false)
|
||||
public boolean CAP_BASE_QUALITY = false;
|
||||
|
||||
|
||||
public UnifiedArgumentCollection clone() {
|
||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||
|
||||
uac.GLmodel = GLmodel;
|
||||
uac.baseModel = baseModel;
|
||||
uac.heterozygosity = heterozygosity;
|
||||
uac.GENOTYPE_MODE = GENOTYPE_MODE;
|
||||
uac.ALL_BASES_MODE = ALL_BASES_MODE;
|
||||
uac.NO_SLOD = NO_SLOD;
|
||||
uac.ASSUME_SINGLE_SAMPLE = ASSUME_SINGLE_SAMPLE;
|
||||
uac.defaultPlatform = defaultPlatform;
|
||||
uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING;
|
||||
uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||
uac.TRIGGER_CONFIDENCE_FOR_CALLING = TRIGGER_CONFIDENCE_FOR_CALLING;
|
||||
uac.TRIGGER_CONFIDENCE_FOR_EMITTING = TRIGGER_CONFIDENCE_FOR_EMITTING;
|
||||
uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE;
|
||||
uac.MIN_MAPPING_QUALTY_SCORE = MIN_MAPPING_QUALTY_SCORE;
|
||||
uac.MAX_MISMATCHES = MAX_MISMATCHES;
|
||||
uac.USE_BADLY_MATED_READS = USE_BADLY_MATED_READS;
|
||||
uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION;
|
||||
|
||||
return uac;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,477 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.util.variantcontext.Genotype;
|
||||
import org.broad.tribble.util.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.pileup.*;
|
||||
import org.broad.tribble.vcf.VCFConstants;
|
||||
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class UnifiedGenotyperEngine {
|
||||
|
||||
public static final String TRIGGER_TRACK_NAME = "trigger";
|
||||
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
|
||||
|
||||
// the unified argument collection
|
||||
private UnifiedArgumentCollection UAC = null;
|
||||
|
||||
// the annotation engine
|
||||
private VariantAnnotatorEngine annotationEngine;
|
||||
|
||||
// the model used for calculating genotypes
|
||||
private ThreadLocal<GenotypeLikelihoodsCalculationModel> glcm = new ThreadLocal<GenotypeLikelihoodsCalculationModel>();
|
||||
|
||||
// the model used for calculating p(non-ref)
|
||||
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
|
||||
|
||||
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
|
||||
private double[] log10AlleleFrequencyPriors;
|
||||
|
||||
// the allele frequency likelihoods (allocated once as an optimization)
|
||||
private ThreadLocal<double[]> log10AlleleFrequencyPosteriors = new ThreadLocal<double[]>();
|
||||
|
||||
// the priors object
|
||||
private GenotypePriors genotypePriors;
|
||||
|
||||
// the various loggers and writers
|
||||
private Logger logger = null;
|
||||
private PrintStream verboseWriter = null;
|
||||
|
||||
// number of chromosomes (2 * samples) in input
|
||||
private int N;
|
||||
|
||||
// the standard filter to use for calls below the confidence threshold but above the emit threshold
|
||||
private static final Set<String> filter = new HashSet<String>(1);
|
||||
|
||||
|
||||
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
|
||||
// get the number of samples
|
||||
// if we're supposed to assume a single sample, do so
|
||||
int numSamples;
|
||||
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
|
||||
numSamples = 1;
|
||||
else
|
||||
numSamples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()).size();
|
||||
initialize(UAC, null, null, null, numSamples);
|
||||
}
|
||||
|
||||
public UnifiedGenotyperEngine(UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
|
||||
initialize(UAC, logger, verboseWriter, engine, numSamples);
|
||||
}
|
||||
|
||||
private void initialize(UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
|
||||
this.UAC = UAC;
|
||||
this.logger = logger;
|
||||
this.verboseWriter = verboseWriter;
|
||||
this.annotationEngine = engine;
|
||||
|
||||
N = 2 * numSamples;
|
||||
log10AlleleFrequencyPriors = new double[N+1];
|
||||
computeAlleleFrequencyPriors(N);
|
||||
genotypePriors = createGenotypePriors(UAC);
|
||||
|
||||
filter.add(LOW_QUAL_FILTER_NAME);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute at a given locus.
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @return the VariantCallContext object
|
||||
*/
|
||||
public VariantCallContext runGenotyper(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
|
||||
// initialize the GenotypeCalculationModel for this thread if that hasn't been done yet
|
||||
if ( glcm.get() == null ) {
|
||||
glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
|
||||
log10AlleleFrequencyPosteriors.set(new double[N+1]);
|
||||
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
|
||||
}
|
||||
|
||||
BadReadPileupFilter badReadPileupFilter = new BadReadPileupFilter(refContext);
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, badReadPileupFilter);
|
||||
if ( stratifiedContexts == null )
|
||||
return null;
|
||||
|
||||
Map<String, BiallelicGenotypeLikelihoods> GLs = new HashMap<String, BiallelicGenotypeLikelihoods>();
|
||||
Allele refAllele = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE, genotypePriors, GLs);
|
||||
|
||||
// estimate our confidence in a reference call and return
|
||||
if ( GLs.size() == 0 )
|
||||
return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), false);
|
||||
|
||||
// reset the optimization value and determine the p(AF>0)
|
||||
// TODO: get rid of this optimization, it is wrong!
|
||||
afcm.get().setMinAlleleFrequencyToTest(0);
|
||||
|
||||
// zero out the AFs above the N for this position
|
||||
ignoreAlleleFrequenciesAboveI(log10AlleleFrequencyPosteriors.get(), 2 * GLs.size());
|
||||
afcm.get().getLog10PNonRef(tracker, refContext, GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());
|
||||
|
||||
// find the most likely frequency
|
||||
int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get());
|
||||
|
||||
// calculate p(f>0)
|
||||
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get());
|
||||
double sum = 0.0;
|
||||
for (int i = 1; i <= N; i++)
|
||||
sum += normalizedPosteriors[i];
|
||||
double PofF = Math.min(sum, 1.0); // deal with precision errors
|
||||
|
||||
double phredScaledConfidence;
|
||||
if ( bestAFguess != 0 ) {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
|
||||
if ( Double.isInfinite(phredScaledConfidence) )
|
||||
phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0];
|
||||
} else {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
|
||||
if ( Double.isInfinite(phredScaledConfidence) ) {
|
||||
sum = 0.0;
|
||||
for (int i = 1; i <= N; i++) {
|
||||
if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
|
||||
break;
|
||||
sum += log10AlleleFrequencyPosteriors.get()[i];
|
||||
}
|
||||
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
|
||||
}
|
||||
}
|
||||
|
||||
// did we trigger on the provided track?
|
||||
boolean atTriggerTrack = tracker.getReferenceMetaData(TRIGGER_TRACK_NAME, false).size() > 0;
|
||||
|
||||
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
|
||||
if ( !UAC.ALL_BASES_MODE && !passesEmitThreshold(phredScaledConfidence, bestAFguess, atTriggerTrack) ) {
|
||||
// technically, at this point our confidence in a reference call isn't accurately estimated
|
||||
// because it didn't take into account samples with no data, so let's get a better estimate
|
||||
return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), true);
|
||||
}
|
||||
|
||||
// create the genotypes
|
||||
Map<String, Genotype> genotypes = afcm.get().assignGenotypes(stratifiedContexts, GLs, log10AlleleFrequencyPosteriors.get(), bestAFguess);
|
||||
|
||||
// next, get the variant context data (alleles, attributes, etc.)
|
||||
HashSet<Allele> alleles = new HashSet<Allele>();
|
||||
alleles.add(refAllele);
|
||||
for ( Genotype g : genotypes.values() )
|
||||
alleles.addAll(g.getAlleles());
|
||||
|
||||
// *** note that calculating strand bias involves overwriting data structures, so we do that last
|
||||
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||
|
||||
DbSNPFeature dbsnp = getDbSNP(tracker);
|
||||
if ( dbsnp != null )
|
||||
attributes.put(VariantContext.ID_KEY, dbsnp.getRsID());
|
||||
|
||||
// if the site was downsampled, record that fact
|
||||
if ( rawContext.hasPileupBeenDownsampled() )
|
||||
attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);
|
||||
|
||||
|
||||
if ( !UAC.NO_SLOD ) {
|
||||
|
||||
// TODO: implement me
|
||||
|
||||
//Map<String, BiallelicGenotypeLikelihoods> forwardGLs = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD, genotypePriors);
|
||||
//Map<String, BiallelicGenotypeLikelihoods> reverseGLs = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE, genotypePriors);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
GenomeLoc loc = refContext.getLocus();
|
||||
VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes);
|
||||
|
||||
if ( annotationEngine != null ) {
|
||||
// first off, we want to use the *unfiltered* context for the annotations
|
||||
stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE);
|
||||
|
||||
Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vc);
|
||||
vc = variantContexts.iterator().next(); //We know the collection will always have exactly 1 element.
|
||||
}
|
||||
|
||||
// print out stats if we have a writer
|
||||
if ( verboseWriter != null )
|
||||
printVerboseData(vc, PofF, phredScaledConfidence, normalizedPosteriors);
|
||||
|
||||
VariantCallContext call = new VariantCallContext(vc, passesCallThreshold(phredScaledConfidence, atTriggerTrack));
|
||||
call.setRefBase(refContext.getBase());
|
||||
return call;
|
||||
}
|
||||
|
||||
private static boolean isValidDeletionFraction(double d) {
|
||||
return ( d >= 0.0 && d <= 1.0 );
|
||||
}
|
||||
|
||||
private Map<String, StratifiedAlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, BadReadPileupFilter badReadPileupFilter) {
|
||||
Map<String, StratifiedAlignmentContext> stratifiedContexts = null;
|
||||
|
||||
if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL && rawContext.hasExtendedEventPileup() ) {
|
||||
|
||||
ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup();
|
||||
|
||||
// filter the context based on min mapping quality
|
||||
ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE);
|
||||
|
||||
// filter the context based on bad mates and mismatch rate
|
||||
pileup = pileup.getFilteredPileup(badReadPileupFilter);
|
||||
|
||||
// don't call when there is no coverage
|
||||
if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE )
|
||||
return null;
|
||||
|
||||
// stratify the AlignmentContext and cut by sample
|
||||
stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UAC.ASSUME_SINGLE_SAMPLE);
|
||||
|
||||
} else if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP && !rawContext.hasExtendedEventPileup() ) {
|
||||
|
||||
byte ref = refContext.getBase();
|
||||
if ( !BaseUtils.isRegularBase(ref) )
|
||||
return null;
|
||||
|
||||
ReadBackedPileup rawPileup = rawContext.getBasePileup();
|
||||
|
||||
// filter the context based on min base and mapping qualities
|
||||
ReadBackedPileup pileup = rawPileup.getBaseAndMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE, UAC.MIN_MAPPING_QUALTY_SCORE);
|
||||
|
||||
// filter the context based on bad mates and mismatch rate
|
||||
pileup = pileup.getFilteredPileup(badReadPileupFilter);
|
||||
|
||||
// don't call when there is no coverage
|
||||
if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE )
|
||||
return null;
|
||||
|
||||
// are there too many deletions in the pileup?
|
||||
if ( isValidDeletionFraction(UAC.MAX_DELETION_FRACTION) &&
|
||||
(double)pileup.getNumberOfDeletions() / (double)pileup.size() > UAC.MAX_DELETION_FRACTION )
|
||||
return null;
|
||||
|
||||
// stratify the AlignmentContext and cut by sample
|
||||
stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UAC.ASSUME_SINGLE_SAMPLE);
|
||||
}
|
||||
|
||||
return stratifiedContexts;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param AFs AF array
|
||||
* @param freqI allele frequency I
|
||||
*/
|
||||
protected static void ignoreAlleleFrequenciesAboveI(double[] AFs, int freqI) {
|
||||
while ( ++freqI < AFs.length )
|
||||
AFs[freqI] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
|
||||
}
|
||||
|
||||
/**
 * Estimates the confidence that this site is homozygous-reference when no variant
 * call can be made.
 *
 * NOTE(review): this is a stub.  P_of_ref is hard-coded to 0.5 and the parameters
 * (contexts, theta, ignoreCoveredSamples) are currently unused; the intended
 * per-sample binomial computation is sketched in the commented-out code below.
 *
 * @param contexts             per-sample stratified contexts (currently unused)
 * @param theta                heterozygosity parameter (currently unused)
 * @param ignoreCoveredSamples whether covered samples should be skipped (currently unused)
 * @return a variant-free VariantCallContext whose confidentlyCalled flag reflects
 *         whether the phred-scaled reference confidence meets the calling threshold
 */
private VariantCallContext estimateReferenceConfidence(Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples) {

    // TODO: implement me

    double P_of_ref = 1.0;

    // use the AF=0 prob if it's calculated
    //if ( ignoreCoveredSamples )
    //    P_of_ref = 1.0 - PofFs[BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele)];

    // for each sample that we haven't examined yet
    //for ( String sample : samples ) {
    //    boolean isCovered = contexts.containsKey(sample);
    //    if ( ignoreCoveredSamples && isCovered )
    //        continue;

    // placeholder value until the per-sample computation above is enabled
    P_of_ref = 0.5;
    //    int depth = isCovered ? contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup().size() : 0;
    //    P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
    //}

    // the error probability is 1 - P(ref); phred-scale it and compare to the calling threshold
    return new VariantCallContext(QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING);
}
|
||||
|
||||
protected void printVerboseData(VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors) {
|
||||
for (int i = 0; i <= N; i++) {
|
||||
StringBuilder AFline = new StringBuilder("AFINFO\t");
|
||||
AFline.append(vc.getChr()).append(":").append(vc.getStart()).append("\t");
|
||||
AFline.append(vc.getReference()).append("\t");
|
||||
if ( vc.isBiallelic() )
|
||||
AFline.append(vc.getAlternateAllele(0)).append("\t");
|
||||
else
|
||||
AFline.append("N/A\t");
|
||||
AFline.append(i + "/" + N + "\t");
|
||||
AFline.append(String.format("%.2f\t", ((float)i)/N));
|
||||
AFline.append(String.format("%.8f\t", log10AlleleFrequencyPriors[i]));
|
||||
AFline.append(String.format("%.8f\t", log10AlleleFrequencyPosteriors.get()[i]));
|
||||
AFline.append(String.format("%.8f\t", normalizedPosteriors[i]));
|
||||
verboseWriter.println(AFline.toString());
|
||||
}
|
||||
|
||||
verboseWriter.println("P(f>0) = " + PofF);
|
||||
verboseWriter.println("Qscore = " + phredScaledConfidence);
|
||||
verboseWriter.println();
|
||||
}
|
||||
|
||||
/**
 * Filters low quality reads out of the pileup.
 *
 * A pileup element is kept only when (a) badly-mated reads are allowed or its mate
 * is not bad, and (b) its mismatch count against the local reference window does
 * not exceed UAC.MAX_MISMATCHES.
 */
private class BadReadPileupFilter implements PileupElementFilter {
    // reference context trimmed to a +/-20bp window around the locus, used for mismatch counting
    private ReferenceContext refContext;

    public BadReadPileupFilter(ReferenceContext ref) {
        // create the +/-20bp window, clipped to the bounds of the available reference window
        GenomeLoc window = GenomeLocParser.createGenomeLoc(ref.getLocus().getContig(), Math.max(ref.getWindow().getStart(), ref.getLocus().getStart()-20), Math.min(ref.getWindow().getStop(), ref.getLocus().getStart()+20));
        // NOTE(review): bases is fixed at 41 entries (20 + 1 + 20), but near window
        // edges window.size() can be smaller, leaving trailing zero bytes — confirm
        // that ReferenceContext tolerates a bases array longer than its window.
        byte[] bases = new byte[41];
        System.arraycopy(ref.getBases(), (int)Math.max(0, window.getStart()-ref.getWindow().getStart()), bases, 0, (int)window.size());
        refContext = new ReferenceContext(ref.getLocus(), window, bases);
    }

    public boolean allow(PileupElement pileupElement) {
        // keep the read if its mate passes (or badly-mated reads are allowed) AND
        // its mismatches within the reference window stay under the budget
        return ((UAC.USE_BADLY_MATED_READS || !BadMateFilter.hasBadMate(pileupElement.getRead())) &&
                AlignmentUtils.mismatchesInRefWindow(pileupElement, refContext, true) <= UAC.MAX_MISMATCHES );
    }
}
|
||||
|
||||
/**
 * Looks up the dbSNP record overlapping the current locus.
 *
 * @param tracker rod data
 * @return the dbsnp rod if there is one at this position; presumably null when the
 *         standard dbSNP track has no real SNP here — see DbSNPHelper.getFirstRealSNP
 */
protected static DbSNPFeature getDbSNP(RefMetaDataTracker tracker) {
    return DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
}
|
||||
|
||||
protected boolean passesEmitThreshold(double conf, int bestAFguess, boolean atTriggerTrack) {
|
||||
return (atTriggerTrack ?
|
||||
(conf >= Math.min(UAC.TRIGGER_CONFIDENCE_FOR_CALLING, UAC.TRIGGER_CONFIDENCE_FOR_EMITTING)) :
|
||||
((UAC.GENOTYPE_MODE || bestAFguess != 0) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING)));
|
||||
}
|
||||
|
||||
protected boolean passesCallThreshold(double conf, boolean atTriggerTrack) {
|
||||
return (atTriggerTrack ?
|
||||
(conf >= UAC.TRIGGER_CONFIDENCE_FOR_CALLING) :
|
||||
(conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING));
|
||||
}
|
||||
|
||||
protected void computeAlleleFrequencyPriors(int N) {
|
||||
// calculate sum(1/i)
|
||||
double sigma_1_over_I = 0.0;
|
||||
for (int i = 1; i <= N; i++)
|
||||
sigma_1_over_I += 1.0 / (double)i;
|
||||
|
||||
// delta = theta / sum(1/i)
|
||||
double delta = UAC.heterozygosity / sigma_1_over_I;
|
||||
|
||||
// calculate the null allele frequencies for 1-N
|
||||
double sum = 0.0;
|
||||
for (int i = 1; i <= N; i++) {
|
||||
double value = delta / (double)i;
|
||||
log10AlleleFrequencyPriors[i] = Math.log10(value);
|
||||
sum += value;
|
||||
}
|
||||
|
||||
// null frequency for AF=0 is (1 - sum(all other frequencies))
|
||||
log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum);
|
||||
}
|
||||
|
||||
// TODO: enable me
|
||||
protected void computeAlleleFrequencyPriorsCorrect(int N) {
|
||||
// calculate the allele frequency priors for 1-N
|
||||
double sum = 0.0;
|
||||
for (int i = 1; i <= N; i++) {
|
||||
double value = UAC.heterozygosity / (double)i;
|
||||
log10AlleleFrequencyPriors[i] = Math.log10(value);
|
||||
sum += value;
|
||||
}
|
||||
|
||||
// null frequency for AF=0 is (1 - sum(all other frequencies))
|
||||
log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum);
|
||||
}
|
||||
|
||||
private static GenotypePriors createGenotypePriors(UnifiedArgumentCollection UAC) {
|
||||
GenotypePriors priors;
|
||||
switch ( UAC.GLmodel ) {
|
||||
case SNP:
|
||||
// use flat priors for GLs
|
||||
priors = new DiploidSNPGenotypePriors();
|
||||
break;
|
||||
case DINDEL:
|
||||
// TODO: create indel priors object
|
||||
priors = null;
|
||||
break;
|
||||
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
|
||||
}
|
||||
|
||||
return priors;
|
||||
}
|
||||
|
||||
private static GenotypeLikelihoodsCalculationModel getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) {
|
||||
GenotypeLikelihoodsCalculationModel glcm;
|
||||
switch ( UAC.GLmodel ) {
|
||||
case SNP:
|
||||
glcm = new SNPGenotypeLikelihoodsCalculationModel(UAC, logger);
|
||||
break;
|
||||
case DINDEL:
|
||||
glcm = new DindelGenotypeLikelihoodsCalculationModel(UAC, logger);
|
||||
break;
|
||||
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
|
||||
}
|
||||
|
||||
return glcm;
|
||||
}
|
||||
|
||||
private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
|
||||
AlleleFrequencyCalculationModel afcm;
|
||||
switch ( UAC.AFmodel ) {
|
||||
case EXACT:
|
||||
afcm = new ExactAFCalculationModel(N, logger, verboseWriter);
|
||||
break;
|
||||
case GRID_SEARCH:
|
||||
afcm = new GridSearchAFEstimation(N, logger, verboseWriter);
|
||||
break;
|
||||
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
|
||||
}
|
||||
|
||||
return afcm;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,241 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.PrintStream;
|
||||
|
||||
|
||||
/**
|
||||
* A variant caller which unifies the approaches of several disparate callers. Works for single-sample and
|
||||
* multi-sample data. The user can choose from several different incorporated calculation models.
|
||||
*/
|
||||
@Reference(window=@Window(start=-200,stop=200))
|
||||
@By(DataSource.REFERENCE)
|
||||
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
|
||||
public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedGenotyperV2.UGStatistics> implements TreeReducible<UnifiedGenotyperV2.UGStatistics> {
|
||||
|
||||
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||
|
||||
// control the output
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected VCFWriter writer = null;
|
||||
|
||||
@Argument(fullName="variants_out",shortName="varout",doc="Please use --out instead",required=false)
|
||||
@Deprecated
|
||||
protected String varout;
|
||||
|
||||
@Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
|
||||
protected PrintStream verboseWriter = null;
|
||||
|
||||
@Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print any relevant callability metrics output", required = false)
|
||||
protected PrintStream metricsWriter = null;
|
||||
|
||||
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
|
||||
protected List<String> annotationsToUse = new ArrayList<String>();
|
||||
|
||||
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
|
||||
protected String[] annotationClassesToUse = { "Standard" };
|
||||
|
||||
// the calculation arguments
|
||||
private UnifiedGenotyperEngine UG_engine = null;
|
||||
|
||||
// the annotation engine
|
||||
private VariantAnnotatorEngine annotationEngine;
|
||||
|
||||
// samples in input
|
||||
private Set<String> samples = new TreeSet<String>();
|
||||
|
||||
// enable deletions in the pileup
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
// enable extended events for indels
|
||||
public boolean generateExtendedEvents() { return UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL; }
|
||||
|
||||
/**
|
||||
* Inner class for collecting output statistics from the UG
|
||||
*/
|
||||
public static class UGStatistics {
|
||||
/** The total number of passes examined -- i.e., the number of map calls */
|
||||
long nBasesVisited = 0;
|
||||
|
||||
/** The number of bases that were potentially callable -- i.e., those not at excessive coverage or masked with N */
|
||||
long nBasesCallable = 0;
|
||||
|
||||
/** The number of bases called confidently (according to user threshold), either ref or other */
|
||||
long nBasesCalledConfidently = 0;
|
||||
|
||||
/** The number of bases for which calls were emitted */
|
||||
long nCallsMade = 0;
|
||||
|
||||
/** The total number of extended events encountered */
|
||||
long nExtendedEvents = 0;
|
||||
|
||||
double percentCallableOfAll() { return (100.0 * nBasesCallable) / (nBasesVisited-nExtendedEvents); }
|
||||
double percentCalledOfAll() { return (100.0 * nBasesCalledConfidently) / (nBasesVisited-nExtendedEvents); }
|
||||
double percentCalledOfCallable() { return (100.0 * nBasesCalledConfidently) / (nBasesVisited-nExtendedEvents); }
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the samples, output, and genotype calculation model
|
||||
*
|
||||
**/
|
||||
public void initialize() {
|
||||
// get all of the unique sample names
|
||||
// if we're supposed to assume a single sample, do so
|
||||
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
|
||||
samples.add(UAC.ASSUME_SINGLE_SAMPLE);
|
||||
else
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
|
||||
// initialize the verbose writer
|
||||
if ( verboseWriter != null )
|
||||
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior");
|
||||
|
||||
annotationEngine = new VariantAnnotatorEngine(getToolkit(), Arrays.asList(annotationClassesToUse), annotationsToUse);
|
||||
UG_engine = new UnifiedGenotyperEngine(UAC, logger, verboseWriter, annotationEngine, samples.size());
|
||||
|
||||
// initialize the header
|
||||
writer.writeHeader(new VCFHeader(getHeaderInfo(), samples)) ;
|
||||
}
|
||||
|
||||
private Set<VCFHeaderLine> getHeaderInfo() {
|
||||
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
|
||||
|
||||
// all annotation fields from VariantAnnotatorEngine
|
||||
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
|
||||
|
||||
// annotation (INFO) fields from UnifiedGenotyper
|
||||
if ( !UAC.NO_SLOD )
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias"));
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));
|
||||
|
||||
// also, check to see whether comp rods were included
|
||||
List<ReferenceOrderedDataSource> dataSources = getToolkit().getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
if ( source.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership"));
|
||||
}
|
||||
else if ( source.getName().startsWith(VariantAnnotatorEngine.dbPrefix) ) {
|
||||
String name = source.getName().substring(VariantAnnotatorEngine.dbPrefix.length());
|
||||
headerInfo.add(new VCFInfoHeaderLine(name, 0, VCFHeaderLineType.Flag, name + " Membership"));
|
||||
}
|
||||
}
|
||||
|
||||
// FORMAT and INFO fields
|
||||
headerInfo.addAll(VCFUtils.getSupportedHeaderStrings());
|
||||
|
||||
// FILTER fields
|
||||
if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING ||
|
||||
UAC.TRIGGER_CONFIDENCE_FOR_EMITTING < UAC.TRIGGER_CONFIDENCE_FOR_CALLING )
|
||||
headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
|
||||
|
||||
return headerInfo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute at a given locus.
|
||||
*
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @return the VariantCallContext object
|
||||
*/
|
||||
public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
|
||||
return UG_engine.runGenotyper(tracker, refContext, rawContext);
|
||||
}
|
||||
|
||||
public UGStatistics reduceInit() { return new UGStatistics(); }
|
||||
|
||||
public UGStatistics treeReduce(UGStatistics lhs, UGStatistics rhs) {
|
||||
lhs.nBasesCallable += rhs.nBasesCallable;
|
||||
lhs.nBasesCalledConfidently += rhs.nBasesCalledConfidently;
|
||||
lhs.nBasesVisited += rhs.nBasesVisited;
|
||||
lhs.nCallsMade += rhs.nCallsMade;
|
||||
return lhs;
|
||||
}
|
||||
|
||||
public UGStatistics reduce(VariantCallContext value, UGStatistics sum) {
|
||||
// we get a point for reaching reduce
|
||||
sum.nBasesVisited++;
|
||||
|
||||
// can't call the locus because of no coverage
|
||||
if ( value == null )
|
||||
return sum;
|
||||
|
||||
// A call was attempted -- the base was potentially callable
|
||||
sum.nBasesCallable++;
|
||||
|
||||
// the base was confidently callable
|
||||
sum.nBasesCalledConfidently += value.confidentlyCalled ? 1 : 0;
|
||||
|
||||
// can't make a confident variant call here
|
||||
if ( value.vc == null )
|
||||
return sum;
|
||||
|
||||
try {
|
||||
// we are actually making a call
|
||||
sum.nCallsMade++;
|
||||
writer.add(value.vc, value.refBase);
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(UGStatistics sum) {
|
||||
logger.info(String.format("Visited bases %d", sum.nBasesVisited));
|
||||
logger.info(String.format("Callable bases %d", sum.nBasesCallable));
|
||||
logger.info(String.format("Confidently called bases %d", sum.nBasesCalledConfidently));
|
||||
logger.info(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll()));
|
||||
logger.info(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll()));
|
||||
logger.info(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable()));
|
||||
logger.info(String.format("Actual calls made %d", sum.nCallsMade));
|
||||
|
||||
if ( metricsWriter != null ) {
|
||||
metricsWriter.println(String.format("Visited bases %d", sum.nBasesVisited));
|
||||
metricsWriter.println(String.format("Callable bases %d", sum.nBasesCallable));
|
||||
metricsWriter.println(String.format("Confidently called bases %d", sum.nBasesCalledConfidently));
|
||||
metricsWriter.println(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll()));
|
||||
metricsWriter.println(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll()));
|
||||
metricsWriter.println(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable()));
|
||||
metricsWriter.println(String.format("Actual calls made %d", sum.nCallsMade));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,64 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo, ebanks
|
||||
* Date: Jan 22, 2010
|
||||
* Time: 2:25:19 PM
|
||||
*
|
||||
* Useful helper class to communicate the results of calculateGenotype to framework
|
||||
*/
|
||||
public class VariantCallContext {
|
||||
public VariantContext vc = null;
|
||||
public byte refBase;
|
||||
|
||||
// Was the site called confidently, either reference or variant?
|
||||
public boolean confidentlyCalled = false;
|
||||
|
||||
VariantCallContext(VariantContext vc, boolean confidentlyCalledP) {
|
||||
this.vc = vc;
|
||||
this.confidentlyCalled = confidentlyCalledP;
|
||||
}
|
||||
|
||||
VariantCallContext(VariantContext vc, byte ref, boolean confidentlyCalledP) {
|
||||
this.vc = vc;
|
||||
this.refBase = ref;
|
||||
this.confidentlyCalled = confidentlyCalledP;
|
||||
}
|
||||
|
||||
// blank variant context => we're a ref site
|
||||
VariantCallContext(boolean confidentlyCalledP) {
|
||||
this.confidentlyCalled = confidentlyCalledP;
|
||||
}
|
||||
|
||||
public void setRefBase(byte ref) {
|
||||
this.refBase = ref;
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue