Initial version of refactored Unified Genotyper. Using SNP genotype likelihoods and GRID_SEARCH AF estimation models, achieves the exact same results as original UG on 1-2 samples with the exception of strand bias (not implemented yet); other than that I have no idea. Needs tons more testing. Do not use. For Guillermo only.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4377 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6df7f9318f
commit
0ec07ad99a
|
|
@ -0,0 +1,355 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
|
import org.broad.tribble.util.variantcontext.Genotype;
|
||||||
|
import org.broad.tribble.util.variantcontext.Allele;
|
||||||
|
import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
|
||||||
|
import org.broad.tribble.vcf.VCFConstants;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * The model representing how we calculate a genotype given the priors and a pile
 * of bases and quality scores.
 *
 * Concrete subclasses implement the allele-frequency estimation itself
 * (see {@link Model}); this base class owns the shared machinery for turning
 * an estimated allele frequency into per-sample genotype calls via a greedy
 * grid search over the {@link AlleleFrequencyMatrix}.
 */
public abstract class AlleleFrequencyCalculationModel implements Cloneable {

    /** The available allele-frequency estimation models. */
    public enum Model {
        EXACT,
        GRID_SEARCH
    }

    // number of samples this model was sized for
    protected int N;
    // per-sample genotype likelihood grid driving the greedy AF search
    protected AlleleFrequencyMatrix AFMatrix;
    // samples judged confidently hom-ref up front and excluded from the AF search
    protected Set<BiallelicGenotypeLikelihoods> refCalls;

    protected Logger logger;
    protected PrintStream verboseWriter;

    // lower bound on the allele frequencies worth testing (see TODO below)
    private int minAlleleFrequencyToTest;

    /**
     * @param N             number of samples
     * @param logger        logger for status/diagnostic output
     * @param verboseWriter stream for verbose output (used by subclasses)
     */
    protected AlleleFrequencyCalculationModel(int N, Logger logger, PrintStream verboseWriter) {
        this.N = N;
        this.logger = logger;
        this.verboseWriter = verboseWriter;
        AFMatrix = new AlleleFrequencyMatrix(N);
        refCalls = new HashSet<BiallelicGenotypeLikelihoods>();
    }

    /**
     * Must be overridden by concrete subclasses
     * @param tracker rod data
     * @param ref reference context
     * @param GLs genotype likelihoods
     * @param log10AlleleFrequencyPriors priors
     * @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results
     */
    public abstract void getLog10PNonRef(RefMetaDataTracker tracker,
                                         ReferenceContext ref,
                                         Map<String, BiallelicGenotypeLikelihoods> GLs,
                                         double[] log10AlleleFrequencyPriors,
                                         double[] log10AlleleFrequencyPosteriors);

    /**
     * Can be overridden by concrete subclasses.
     *
     * Rebuilds the AF matrix from the supplied likelihoods, walks the greedy grid
     * up to the maximum-likelihood allele frequency, and emits one Genotype per sample.
     *
     * NOTE(review): log10AlleleFrequencyPosteriors is currently unused in this
     * default implementation; it is presumably consumed by overriding subclasses.
     *
     * @param contexts alignment contexts
     * @param GLs genotype likelihoods
     * @param log10AlleleFrequencyPosteriors allele frequency results
     * @param AFofMaxLikelihood allele frequency of max likelihood
     *
     * @return calls keyed by sample name
     */
    public Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
                                                 Map<String, BiallelicGenotypeLikelihoods> GLs,
                                                 double[] log10AlleleFrequencyPosteriors,
                                                 int AFofMaxLikelihood) {
        initializeAFMatrix(GLs);

        // increment the grid: each step greedily converts the single best sample's
        // genotype to hold one more alternate allele
        for (int i = 1; i <= AFofMaxLikelihood; i++) {
            // add one more alternate allele
            AFMatrix.incrementFrequency();
        }

        return generateCalls(contexts, GLs, AFofMaxLikelihood);
    }

    // TODO: get rid of this optimization, it is wrong!
    protected int getMinAlleleFrequencyToTest() {
        return minAlleleFrequencyToTest;
    }

    protected void setMinAlleleFrequencyToTest(int minAF) {
        minAlleleFrequencyToTest = minAF;
    }

    /**
     * Builds the final per-sample Genotype map for a given allele frequency:
     * samples in the AF matrix get the genotype recorded for that frequency,
     * and the pre-screened hom-ref samples get a ref/ref call.
     *
     * @param contexts  alignment contexts (used only for the DP attribute)
     * @param GLs       genotype likelihoods per sample
     * @param frequency the alternate allele frequency to report genotypes for
     * @return calls keyed by sample name
     */
    protected Map<String, Genotype> generateCalls(Map<String, StratifiedAlignmentContext> contexts,
                                                  Map<String, BiallelicGenotypeLikelihoods> GLs,
                                                  int frequency) {
        HashMap<String, Genotype> calls = new HashMap<String, Genotype>();

        // first, the potential alt calls
        for ( String sample : AFMatrix.getSamples() ) {
            BiallelicGenotypeLikelihoods GL = GLs.get(sample);
            Allele alleleA = GL.getAlleleA();
            Allele alleleB = GL.getAlleleB();

            // set the genotype and confidence: pair of (GenotypeType ordinal, quality score)
            Pair<Integer, Double> AFbasedGenotype = AFMatrix.getGenotype(frequency, sample);
            ArrayList<Allele> myAlleles = new ArrayList<Allele>();
            if ( AFbasedGenotype.first == GenotypeType.AA.ordinal() ) {
                myAlleles.add(alleleA);
                myAlleles.add(alleleA);
            } else if ( AFbasedGenotype.first == GenotypeType.AB.ordinal() ) {
                myAlleles.add(alleleA);
                myAlleles.add(alleleB);
            } else { // ( AFbasedGenotype.first == GenotypeType.BB.ordinal() )
                myAlleles.add(alleleB);
                myAlleles.add(alleleB);
            }

            HashMap<String, Object> attributes = new HashMap<String, Object>();
            // DP = pileup depth over the complete (unstratified) context
            attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());

            GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
            attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());

            calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, attributes, false));
        }

        // now, the clearly ref calls
        for ( BiallelicGenotypeLikelihoods GL : refCalls ) {
            String sample = GL.getSample();

            // for these samples allele A is the reference (see isClearRefCall)
            Allele ref = GL.getAlleleA();
            ArrayList<Allele> myAlleles = new ArrayList<Allele>();
            myAlleles.add(ref);
            myAlleles.add(ref);

            HashMap<String, Object> attributes = new HashMap<String, Object>();
            attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size());

            GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
            attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());

            // genotype quality: margin of the hom-ref likelihood over the best alternative
            double GQ = GL.getAALikelihoods() - Math.max(GL.getABLikelihoods(), GL.getBBLikelihoods());

            calls.put(sample, new Genotype(sample, myAlleles, GQ, null, attributes, false));

        }

        return calls;
    }

    /**
     * Resets the AF matrix and ref-call set, then routes each sample either to
     * the clear-ref set or into the matrix (seeded with its posteriors).
     *
     * @param GLs genotype likelihoods per sample
     */
    protected void initializeAFMatrix(Map<String, BiallelicGenotypeLikelihoods> GLs) {
        refCalls.clear();
        AFMatrix.clear();

        for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) {
            if ( isClearRefCall(GL) ) {
                refCalls.add(GL);
            } else {
                AFMatrix.setLikelihoods(GL.getPosteriors(), GL.getSample());
            }
        }
    }

    /**
     * A sample is a clear ref call when its allele A is the reference allele and
     * its hom-ref (AA) likelihood strictly dominates both het and hom-var.
     */
    private boolean isClearRefCall(BiallelicGenotypeLikelihoods GL) {
        if ( GL.getAlleleA().isNonReference() )
            return false;

        double[] likelihoods = GL.getLikelihoods();
        return ( likelihoods[0] > likelihoods[1] && likelihoods[0] > likelihoods[2]);
    }

    /** Simple value holder pairing a log10 P(non-ref) with an alt allele count. */
    protected class CalculatedAlleleFrequency {

        public double log10PNonRef;
        public int altAlleles;

        public CalculatedAlleleFrequency(double log10PNonRef, int altAlleles) {
            this.log10PNonRef = log10PNonRef;
            this.altAlleles = altAlleles;
        }
    }

    // ordinals double as column indexes into AlleleFrequencyMatrix.matrix
    private enum GenotypeType { AA, AB, BB }

    // sentinel "not yet computed" value; guaranteed smaller than any real log-likelihood
    protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE;

    /**
     * Greedy grid over samples x {AA, AB, BB}. At each frequency increment the
     * sample whose likelihood improves most by gaining one alternate allele is
     * promoted (AA -> AB or AB -> BB); the chosen genotype per sample is recorded
     * per frequency for later lookup by generateCalls().
     */
    protected static class AlleleFrequencyMatrix {

        private double[][] matrix;    // allele frequency matrix: per-sample log10 likelihoods for AA/AB/BB
        private int[] indexes;        // matrix to maintain which genotype is active per sample
        private int maxN;             // total possible frequencies in data
        private int frequency;        // current frequency

        // data structures necessary to maintain a list of the best genotypes and their scores
        private ArrayList<String> samples = new ArrayList<String>();
        private HashMap<Integer, HashMap<String, Pair<Integer, Double>>> samplesToGenotypesPerAF = new HashMap<Integer, HashMap<String, Pair<Integer, Double>>>();

        public AlleleFrequencyMatrix(int N) {
            maxN = N;
            matrix = new double[N][3];
            indexes = new int[N];
            clear();
        }

        public List<String> getSamples() { return samples; }

        /** Resets to frequency 0 with every sample marked hom-ref (AA). */
        public void clear() {
            frequency = 0;
            for (int i = 0; i < maxN; i++)
                indexes[i] = 0;
            samples.clear();
            samplesToGenotypesPerAF.clear();
        }

        public void setLikelihoods(double AA, double AB, double BB, String sample) {
            int index = samples.size();
            samples.add(sample);
            matrix[index][GenotypeType.AA.ordinal()] = AA;
            matrix[index][GenotypeType.AB.ordinal()] = AB;
            matrix[index][GenotypeType.BB.ordinal()] = BB;
        }

        /** Array overload; GLs must be ordered {AA, AB, BB}. */
        public void setLikelihoods(double[] GLs, String sample) {
            int index = samples.size();
            samples.add(sample);
            matrix[index][GenotypeType.AA.ordinal()] = GLs[0];
            matrix[index][GenotypeType.AB.ordinal()] = GLs[1];
            matrix[index][GenotypeType.BB.ordinal()] = GLs[2];
        }

        /**
         * Adds one alternate allele: greedily promotes the sample whose likelihood
         * gains the most from AA -> AB or AB -> BB. Each diploid sample holds at
         * most 2 alt alleles, so the frequency is capped at 2N.
         */
        public void incrementFrequency() {
            int N = samples.size();
            if ( frequency == 2 * N )
                throw new ReviewedStingException("Frequency was incremented past N; how is this possible?");
            frequency++;

            double greedy = VALUE_NOT_CALCULATED;
            int greedyIndex = -1;
            for (int i = 0; i < N; i++) {

                if ( indexes[i] == GenotypeType.AB.ordinal() ) {
                    if ( matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()] > greedy ) {
                        greedy = matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()];
                        greedyIndex = i;
                    }
                }
                else if ( indexes[i] == GenotypeType.AA.ordinal() ) {
                    if ( matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()] > greedy ) {
                        greedy = matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()];
                        greedyIndex = i;
                    }
                    // note that we currently don't bother with breaking ties between samples
                    // (which would be done by looking at the HOM_VAR value) because it's highly
                    // unlikely that a collision will both occur and that the difference will
                    // be significant at HOM_VAR...
                }
                // if this person is already hom var, he can't add another alternate allele
                // so we can ignore that case
            }
            if ( greedyIndex == -1 )
                throw new ReviewedStingException("There is no best choice for a new alternate allele; how is this possible?");

            if ( indexes[greedyIndex] == GenotypeType.AB.ordinal() )
                indexes[greedyIndex] = GenotypeType.BB.ordinal();
            else
                indexes[greedyIndex] = GenotypeType.AB.ordinal();
        }

        /**
         * Sums the active-genotype log10 likelihood over all samples for the
         * current frequency, and (as a side effect) records the per-sample
         * genotype choices for this frequency via recordGenotypes().
         */
        public double getLikelihoodsOfFrequency() {
            double likelihoods = 0.0;
            int N = samples.size();
            for (int i = 0; i < N; i++)
                likelihoods += matrix[i][indexes[i]];

            recordGenotypes();

            return likelihoods;
        }

        /**
         * Looks up the (GenotypeType ordinal, quality) pair recorded for a sample
         * at a given frequency; recordGenotypes() must have been run for that
         * frequency (via getLikelihoodsOfFrequency) or this will NPE.
         */
        public Pair<Integer, Double> getGenotype(int frequency, String sample) {
            return samplesToGenotypesPerAF.get(frequency).get(sample);
        }

        /**
         * Snapshots each sample's active genotype and a confidence score for the
         * current frequency into samplesToGenotypesPerAF.
         */
        private void recordGenotypes() {
            HashMap<String, Pair<Integer, Double>> samplesToGenotypes = new HashMap<String, Pair<Integer, Double>>();

            int index = 0;
            for ( String sample : samples ) {
                int genotype = indexes[index];

                double score;

                int maxEntry = MathUtils.maxElementIndex(matrix[index]);
                // if the max value is for the most likely genotype, we can compute next vs. next best
                if ( genotype == maxEntry ) {
                    if ( genotype == GenotypeType.AA.ordinal() )
                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AB.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
                    else if ( genotype == GenotypeType.AB.ordinal() )
                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
                    else // ( genotype == GenotypeType.BB.ordinal() )
                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.AB.ordinal()]);
                }
                // otherwise, we need to calculate the probability of the genotype
                else {
                    double[] normalized = MathUtils.normalizeFromLog10(matrix[index]);
                    double chosenGenotype = normalized[genotype];
                    score = -1.0 * Math.log10(1.0 - chosenGenotype);
                }

                samplesToGenotypes.put(sample, new Pair<Integer, Double>(genotype, Math.abs(score)));
                index++;
            }

            samplesToGenotypesPerAF.put(frequency, samplesToGenotypes);
        }
    }
}
|
||||||
|
|
@ -0,0 +1,32 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
/**
 * The base-mismatch models available when computing SNP genotype likelihoods
 * (selects which FourBaseLikelihoods implementation is constructed;
 * see DiploidSNPGenotypeLikelihoods).
 */
public enum BaseMismatchModel {
    ONE_STATE,
    THREE_STATE,
    EMPIRICAL
}
|
||||||
|
|
@ -0,0 +1,132 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import org.broad.tribble.util.variantcontext.Allele;
|
||||||
|
|
||||||
|
public class BiallelicGenotypeLikelihoods {
|
||||||
|
|
||||||
|
private String sample;
|
||||||
|
private double[] GLs;
|
||||||
|
private double[] GPs;
|
||||||
|
private Allele A, B;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new object for sample with given alleles and genotype likelihoods
|
||||||
|
*
|
||||||
|
* @param sample sample name
|
||||||
|
* @param A allele A
|
||||||
|
* @param B allele B
|
||||||
|
* @param log10AALikelihoods AA likelihoods
|
||||||
|
* @param log10ABLikelihoods AB likelihoods
|
||||||
|
* @param log10BBLikelihoods BB likelihoods
|
||||||
|
*/
|
||||||
|
public BiallelicGenotypeLikelihoods(String sample,
|
||||||
|
Allele A,
|
||||||
|
Allele B,
|
||||||
|
double log10AALikelihoods,
|
||||||
|
double log10ABLikelihoods,
|
||||||
|
double log10BBLikelihoods) {
|
||||||
|
this.sample = sample;
|
||||||
|
this.A = A;
|
||||||
|
this.B = B;
|
||||||
|
this.GLs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods};
|
||||||
|
this.GPs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new object for sample with given alleles and genotype likelihoods & posteriors
|
||||||
|
*
|
||||||
|
* @param sample sample name
|
||||||
|
* @param A allele A
|
||||||
|
* @param B allele B
|
||||||
|
* @param log10AALikelihoods AA likelihoods
|
||||||
|
* @param log10ABLikelihoods AB likelihoods
|
||||||
|
* @param log10BBLikelihoods BB likelihoods
|
||||||
|
* @param log10AAPosteriors AA posteriors
|
||||||
|
* @param log10ABPosteriors AB posteriors
|
||||||
|
* @param log10BBPosteriors BB posteriors
|
||||||
|
*/
|
||||||
|
public BiallelicGenotypeLikelihoods(String sample,
|
||||||
|
Allele A,
|
||||||
|
Allele B,
|
||||||
|
double log10AALikelihoods,
|
||||||
|
double log10ABLikelihoods,
|
||||||
|
double log10BBLikelihoods,
|
||||||
|
double log10AAPosteriors,
|
||||||
|
double log10ABPosteriors,
|
||||||
|
double log10BBPosteriors) {
|
||||||
|
this.sample = sample;
|
||||||
|
this.A = A;
|
||||||
|
this.B = B;
|
||||||
|
this.GLs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods};
|
||||||
|
this.GPs = new double[]{log10AAPosteriors, log10ABPosteriors, log10BBPosteriors};
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getSample() {
|
||||||
|
return sample;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getAALikelihoods() {
|
||||||
|
return GLs[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getABLikelihoods() {
|
||||||
|
return GLs[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getBBLikelihoods() {
|
||||||
|
return GLs[2];
|
||||||
|
}
|
||||||
|
|
||||||
|
public double[] getLikelihoods() {
|
||||||
|
return GLs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getAAPosteriors() {
|
||||||
|
return GPs[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getABPosteriors() {
|
||||||
|
return GPs[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getBBPosteriors() {
|
||||||
|
return GPs[2];
|
||||||
|
}
|
||||||
|
|
||||||
|
public double[] getPosteriors() {
|
||||||
|
return GPs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Allele getAlleleA() {
|
||||||
|
return A;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Allele getAlleleB() {
|
||||||
|
return B;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,64 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broad.tribble.util.variantcontext.Allele;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
 * Indel (Dindel-style) genotype likelihoods calculation model.
 *
 * NOTE(review): this is an unimplemented stub — getLikelihoods() iterates the
 * sample contexts without computing anything and always returns null. Callers
 * must not rely on it until the TODOs below are resolved.
 */
public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {

    protected DindelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
        super(UAC, logger);
    }

    /**
     * Intended to fill GLs with a BiallelicGenotypeLikelihoods per sample and
     * return the reference allele; currently a no-op that returns null.
     *
     * @param tracker     rod data
     * @param ref         reference context
     * @param contexts    per-sample stratified alignment contexts
     * @param contextType which stratification to use
     * @param priors      genotype priors
     * @param GLs         output map to populate (currently untouched)
     * @return the reference Allele once implemented; null for now
     */
    public Allele getLikelihoods(RefMetaDataTracker tracker,
                                 ReferenceContext ref,
                                 Map<String, StratifiedAlignmentContext> contexts,
                                 StratifiedAlignmentContext.StratifiedContextType contextType,
                                 GenotypePriors priors,
                                 Map<String, BiallelicGenotypeLikelihoods> GLs) {
        // TODO: check to make sure the priors instanceof a valid priors class

        // TODO: create a single set of Alleles to be passed into each BiallelicGenotypeLikelihoods object to minimize memory consumption

        for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
            // TODO: fill me in

            //GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(), refAllele, altAllele, ...));
        }

        // TODO: return the reference Allele
        return null;
    }
}
|
||||||
|
|
@ -0,0 +1,534 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||||
|
|
||||||
|
import static java.lang.Math.log10;
|
||||||
|
import static java.lang.Math.pow;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors,
|
||||||
|
* and posteriors given a pile of bases and quality scores
|
||||||
|
*
|
||||||
|
* Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object
|
||||||
|
* calculates:
|
||||||
|
*
|
||||||
|
* P(G | D) = P(G) * P(D | G)
|
||||||
|
*
|
||||||
|
* where
|
||||||
|
*
|
||||||
|
* P(D | G) = sum_i log10 P(bi | G)
|
||||||
|
*
|
||||||
|
* and
|
||||||
|
*
|
||||||
|
* P(bi | G) = 1 - P(error | q1) if bi is in G
|
||||||
|
* = P(error | q1) / 3 if bi is not in G
|
||||||
|
*
|
||||||
|
* for homozygous genotypes and for heterozygous genotypes:
|
||||||
|
*
|
||||||
|
* P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G
|
||||||
|
* = P(error | q1) / 3 if bi is not in G
|
||||||
|
*
|
||||||
|
* for each of the 10 unique diploid genotypes AA, AC, AG, .., TT
|
||||||
|
*
|
||||||
|
* Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space.
|
||||||
|
*
|
||||||
|
* The priors contain the relative probabilities of each genotype, and must be provided at object creation.
|
||||||
|
* From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above
|
||||||
|
* model.
|
||||||
|
*/
|
||||||
|
public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
||||||
|
protected final static int FIXED_PLOIDY = 2;
|
||||||
|
protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1;
|
||||||
|
|
||||||
|
protected boolean enableCacheFlag = true;
|
||||||
|
protected boolean VERBOSE = false;
|
||||||
|
|
||||||
|
//
|
||||||
|
// The fundamental data arrays associated with a Genotype Likelhoods object
|
||||||
|
//
|
||||||
|
protected double[] log10Likelihoods = null;
|
||||||
|
protected double[] log10Posteriors = null;
|
||||||
|
|
||||||
|
protected DiploidSNPGenotypePriors priors = null;
|
||||||
|
|
||||||
|
protected FourBaseLikelihoods fourBaseLikelihoods = null;
|
||||||
|
|
||||||
|
    /**
     * Create a new GenotypeLikelhoods object with flat priors for each diploid genotype
     *
     * @param m base model
     */
    public DiploidSNPGenotypeLikelihoods(BaseMismatchModel m) {
        // flat priors; no default sequencer platform
        this.priors = new DiploidSNPGenotypePriors();
        initialize(m, null);
    }
|
||||||
|
|
||||||
|
    /**
     * Create a new GenotypeLikelhoods object with flat priors for each diploid genotype
     *
     * @param m  base model
     * @param pl default platform
     */
    public DiploidSNPGenotypeLikelihoods(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
        // flat priors, but with an explicit default sequencer platform
        this.priors = new DiploidSNPGenotypePriors();
        initialize(m, pl);
    }
|
||||||
|
|
||||||
|
    /**
     * Create a new GenotypeLikelhoods object with given priors for each diploid genotype
     *
     * @param m      base model
     * @param priors priors
     */
    public DiploidSNPGenotypeLikelihoods(BaseMismatchModel m, DiploidSNPGenotypePriors priors) {
        this.priors = priors;
        initialize(m, null);
    }
|
||||||
|
|
||||||
|
    /**
     * Create a new GenotypeLikelhoods object with given priors for each diploid genotype
     *
     * @param m      base model
     * @param priors priors
     * @param pl     default platform
     */
    public DiploidSNPGenotypeLikelihoods(BaseMismatchModel m, DiploidSNPGenotypePriors priors, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
        this.priors = priors;
        initialize(m, pl);
    }
|
||||||
|
|
||||||
|
    /**
     * Cloning of the object: shallow clone via Object.clone(), then deep-copies
     * the likelihood/posterior arrays and the four-base likelihoods object.
     * Priors are shared with the original (the c.priors assignment restates
     * what super.clone() already copied).
     *
     * @return clone
     * @throws CloneNotSupportedException
     */
    protected Object clone() throws CloneNotSupportedException {
        DiploidSNPGenotypeLikelihoods c = (DiploidSNPGenotypeLikelihoods)super.clone();
        c.priors = priors;
        c.log10Likelihoods = log10Likelihoods.clone();
        c.log10Posteriors = log10Posteriors.clone();
        c.fourBaseLikelihoods = (FourBaseLikelihoods)fourBaseLikelihoods.clone();
        return c;
    }
|
||||||
|
|
||||||
|
protected void initialize(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) {
|
||||||
|
fourBaseLikelihoods = FourBaseLikelihoodsFactory.makeFourBaseLikelihoods(m, pl);
|
||||||
|
setToZero();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void setToZero() {
|
||||||
|
log10Likelihoods = zeros.clone(); // likelihoods are all zeros
|
||||||
|
log10Posteriors = priors.getPriors().clone(); // posteriors are all the priors
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setVerbose(boolean v) {
|
||||||
|
VERBOSE = v;
|
||||||
|
fourBaseLikelihoods.setVerbose(v);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isVerbose() {
|
||||||
|
return VERBOSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getMinQScoreToInclude() {
|
||||||
|
return fourBaseLikelihoods.getMinQScoreToInclude();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMinQScoreToInclude(int minQScoreToInclude) {
|
||||||
|
fourBaseLikelihoods.setMinQScoreToInclude(minQScoreToInclude);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values()
|
||||||
|
* @return likelihoods array
|
||||||
|
*/
|
||||||
|
public double[] getLikelihoods() {
|
||||||
|
return log10Likelihoods;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the likelihood associated with DiploidGenotype g
|
||||||
|
* @param g genotype
|
||||||
|
* @return log10 likelihood as a double
|
||||||
|
*/
|
||||||
|
public double getLikelihood(DiploidGenotype g) {
|
||||||
|
return getLikelihoods()[g.ordinal()];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||||
|
*
|
||||||
|
* @return raw log10 (not-normalized posteriors) as a double array
|
||||||
|
*/
|
||||||
|
public double[] getPosteriors() {
|
||||||
|
return log10Posteriors;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the posterior associated with DiploidGenotype g
|
||||||
|
* @param g genotpe
|
||||||
|
* @return raw log10 (not-normalized posteror) as a double
|
||||||
|
*/
|
||||||
|
public double getPosterior(DiploidGenotype g) {
|
||||||
|
return getPosteriors()[g.ordinal()];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||||
|
*
|
||||||
|
* @return normalized posterors as a double array
|
||||||
|
*/
|
||||||
|
public double[] getNormalizedPosteriors() {
|
||||||
|
double[] normalized = new double[log10Posteriors.length];
|
||||||
|
double sum = 0.0;
|
||||||
|
|
||||||
|
// for precision purposes, we need to add (or really subtract, since everything is negative)
|
||||||
|
// the largest posterior value from all entries so that numbers don't get too small
|
||||||
|
double maxValue = log10Posteriors[0];
|
||||||
|
for (int i = 1; i < log10Posteriors.length; i++) {
|
||||||
|
if ( maxValue < log10Posteriors[i] )
|
||||||
|
maxValue = log10Posteriors[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
// collect the posteriors
|
||||||
|
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||||
|
double posterior = Math.pow(10, getPosterior(g) - maxValue);
|
||||||
|
normalized[g.ordinal()] = posterior;
|
||||||
|
sum += posterior;
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalize
|
||||||
|
for (int i = 0; i < normalized.length; i++)
|
||||||
|
normalized[i] /= sum;
|
||||||
|
|
||||||
|
return normalized;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public DiploidSNPGenotypePriors getPriorObject() {
|
||||||
|
return priors;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||||
|
*
|
||||||
|
* @return log10 prior as a double array
|
||||||
|
*/
|
||||||
|
public double[] getPriors() {
|
||||||
|
return priors.getPriors();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the priors
|
||||||
|
* @param priors priors
|
||||||
|
*/
|
||||||
|
public void setPriors(DiploidSNPGenotypePriors priors) {
|
||||||
|
this.priors = priors;
|
||||||
|
log10Posteriors = zeros.clone();
|
||||||
|
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||||
|
int i = g.ordinal();
|
||||||
|
log10Posteriors[i] = priors.getPriors()[i] + log10Likelihoods[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the prior associated with DiploidGenotype g
|
||||||
|
* @param g genotype
|
||||||
|
* @return log10 prior as a double
|
||||||
|
*/
|
||||||
|
public double getPrior(DiploidGenotype g) {
|
||||||
|
return getPriors()[g.ordinal()];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple function to overload to control the caching of genotype likelihood outputs.
|
||||||
|
* By default the function trues true -- do enable caching. If you are experimenting with an
|
||||||
|
* complex calcluation of P(B | G) and caching might not work correctly for you, overload this
|
||||||
|
* function and return false, so the super() object won't try to cache your GL calculations.
|
||||||
|
*
|
||||||
|
* @return true if caching should be enabled, false otherwise
|
||||||
|
*/
|
||||||
|
public boolean cacheIsEnabled() {
|
||||||
|
return enableCacheFlag;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEnableCacheFlag(boolean enable) {
|
||||||
|
enableCacheFlag = enable;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int add(ReadBackedPileup pileup) {
|
||||||
|
return add(pileup, false, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Updates likelihoods and posteriors to reflect the additional observations contained within the
|
||||||
|
* read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
|
||||||
|
* pileup
|
||||||
|
*
|
||||||
|
* @param pileup read pileup
|
||||||
|
* @param ignoreBadBases should we ignore bad bases?
|
||||||
|
* @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality?
|
||||||
|
* @return the number of good bases found in the pileup
|
||||||
|
*/
|
||||||
|
public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
|
||||||
|
int n = 0;
|
||||||
|
|
||||||
|
for ( PileupElement p : pileup ) {
|
||||||
|
// ignore deletions
|
||||||
|
if ( p.isDeletion() )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
byte base = p.getBase();
|
||||||
|
if ( ! ignoreBadBases || ! badBase(base) ) {
|
||||||
|
byte qual = capBaseQualsAtMappingQual ? (byte)Math.min((int)p.getQual(), p.getMappingQual()) : p.getQual();
|
||||||
|
n += add(base, qual, p.getRead(), p.getOffset());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int add(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
|
||||||
|
|
||||||
|
// Handle caching if requested. Just look up the cached result if its available, or compute and store it
|
||||||
|
DiploidSNPGenotypeLikelihoods gl;
|
||||||
|
if ( cacheIsEnabled() ) {
|
||||||
|
if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read) ) {
|
||||||
|
gl = calculateCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset);
|
||||||
|
} else {
|
||||||
|
gl = getCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
gl = calculateGenotypeLikelihoods(observedBase, qualityScore, read, offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
// for bad bases, there are no likelihoods
|
||||||
|
if ( gl == null )
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
double[] likelihoods = gl.getLikelihoods();
|
||||||
|
|
||||||
|
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||||
|
double likelihood = likelihoods[g.ordinal()];
|
||||||
|
|
||||||
|
if ( VERBOSE ) {
|
||||||
|
boolean fwdStrand = ! read.getReadNegativeStrandFlag();
|
||||||
|
System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n",
|
||||||
|
observedBase, g, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood);
|
||||||
|
}
|
||||||
|
|
||||||
|
log10Likelihoods[g.ordinal()] += likelihood;
|
||||||
|
log10Posteriors[g.ordinal()] += likelihood;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
    // Shared cross-instance cache of per-observation genotype likelihoods, indexed by
    // [mismatch model][sequencer platform][observed base][quality][ploidy][strand].
    // NOTE(review): static and unsynchronized -- presumably single-threaded use; confirm.
    static DiploidSNPGenotypeLikelihoods[][][][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseMismatchModel.values().length][EmpiricalSubstitutionProbabilities.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY][2];
|
||||||
|
|
||||||
|
protected boolean inCache( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
|
||||||
|
return getCache(CACHE, observedBase, qualityScore, ploidy, read) != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected DiploidSNPGenotypeLikelihoods getCachedGenotypeLikelihoods( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
|
||||||
|
DiploidSNPGenotypeLikelihoods gl = getCache(CACHE, observedBase, qualityScore, ploidy, read);
|
||||||
|
if ( gl == null )
|
||||||
|
throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s",
|
||||||
|
observedBase, qualityScore, ploidy, read));
|
||||||
|
return gl;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected DiploidSNPGenotypeLikelihoods calculateCachedGenotypeLikelihoods(byte observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset) {
|
||||||
|
DiploidSNPGenotypeLikelihoods gl = calculateGenotypeLikelihoods(observedBase, qualityScore, read, offset);
|
||||||
|
setCache(CACHE, observedBase, qualityScore, ploidy, read, gl);
|
||||||
|
return gl;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void setCache( DiploidSNPGenotypeLikelihoods[][][][][][] cache,
|
||||||
|
byte observedBase, byte qualityScore, int ploidy,
|
||||||
|
SAMRecord read, DiploidSNPGenotypeLikelihoods val ) {
|
||||||
|
int m = FourBaseLikelihoodsFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal();
|
||||||
|
int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read);
|
||||||
|
int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
|
||||||
|
int j = qualityScore;
|
||||||
|
int k = ploidy;
|
||||||
|
int x = strandIndex(! read.getReadNegativeStrandFlag() );
|
||||||
|
|
||||||
|
cache[m][a][i][j][k][x] = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected DiploidSNPGenotypeLikelihoods getCache( DiploidSNPGenotypeLikelihoods[][][][][][] cache,
|
||||||
|
byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
|
||||||
|
int m = FourBaseLikelihoodsFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal();
|
||||||
|
int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read);
|
||||||
|
int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
|
||||||
|
int j = qualityScore;
|
||||||
|
int k = ploidy;
|
||||||
|
int x = strandIndex(! read.getReadNegativeStrandFlag() );
|
||||||
|
return cache[m][a][i][j][k][x];
|
||||||
|
}
|
||||||
|
|
||||||
|
    /**
     * Computes a fresh per-observation DiploidSNPGenotypeLikelihoods for one
     * base, without modifying this object's accumulated state.  The four-base
     * likelihoods are folded into diploid genotype likelihoods by averaging
     * the two chromosome contributions (the ploidy adjustment below).
     *
     * @param observedBase the observed base
     * @param qualityScore its quality score
     * @param read         the read carrying the base
     * @param offset       offset of the base within the read
     * @return a new likelihood object, or null when the four-base engine
     *         yields no likelihoods (bad base)
     */
    protected DiploidSNPGenotypeLikelihoods calculateGenotypeLikelihoods(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
        FourBaseLikelihoods fbl = fourBaseLikelihoods.computeLog10Likelihoods(observedBase, qualityScore, read, offset);
        if ( fbl == null )
            return null;

        double[] fbLikelihoods = fbl.getLog10Likelihoods();
        try {

            // clone so the returned object carries the same configuration but
            // starts from zeroed likelihoods
            DiploidSNPGenotypeLikelihoods gl = (DiploidSNPGenotypeLikelihoods)this.clone();
            gl.setToZero();

            // we need to adjust for ploidy. We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space
            double ploidyAdjustment = log10(FIXED_PLOIDY);

            for ( DiploidGenotype g : DiploidGenotype.values() ) {

                // todo assumes ploidy is 2 -- should be generalized. Obviously the below code can be turned into a loop
                // sum the two per-chromosome contributions in real space, then go back to log10
                double p_base = 0.0;
                p_base += pow(10, fbLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base1)] - ploidyAdjustment);
                p_base += pow(10, fbLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base2)] - ploidyAdjustment);
                double likelihood = log10(p_base);

                gl.log10Likelihoods[g.ordinal()] += likelihood;
                gl.log10Posteriors[g.ordinal()] += likelihood;
            }

            // debug dump: one row of genotype names, one row of log10 likelihoods
            if ( VERBOSE ) {
                for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); }
                System.out.println();
                for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", gl.log10Likelihoods[g.ordinal()]); }
                System.out.println();
            }

            return gl;

        } catch ( CloneNotSupportedException e ) {
            // clone() is implemented above, so this indicates a programming error
            throw new RuntimeException(e);
        }
    }
|
||||||
|
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// helper routines
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// -----------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
public static int strandIndex(boolean fwdStrand) {
|
||||||
|
return fwdStrand ? 0 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns true when the observedBase is considered bad and shouldn't be processed by this object. A base
|
||||||
|
* is considered bad if:
|
||||||
|
*
|
||||||
|
* Criterion 1: observed base isn't a A,C,T,G or lower case equivalent
|
||||||
|
*
|
||||||
|
* @param observedBase observed base
|
||||||
|
* @return true if the base is a bad base
|
||||||
|
*/
|
||||||
|
protected boolean badBase(byte observedBase) {
|
||||||
|
return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return a string representation of this object in a moderately usable form
|
||||||
|
*
|
||||||
|
* @return string representation
|
||||||
|
*/
|
||||||
|
public String toString() {
|
||||||
|
double sum = 0;
|
||||||
|
StringBuilder s = new StringBuilder();
|
||||||
|
for (DiploidGenotype g : DiploidGenotype.values()) {
|
||||||
|
s.append(String.format("%s %.10f ", g, log10Likelihoods[g.ordinal()]));
|
||||||
|
sum += Math.pow(10,log10Likelihoods[g.ordinal()]);
|
||||||
|
}
|
||||||
|
s.append(String.format(" %f", sum));
|
||||||
|
return s.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// Validation routines
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// -----------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
public boolean validate() {
|
||||||
|
return validate(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean validate(boolean throwException) {
|
||||||
|
try {
|
||||||
|
priors.validate(throwException);
|
||||||
|
|
||||||
|
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||||
|
String bad = null;
|
||||||
|
|
||||||
|
int i = g.ordinal();
|
||||||
|
if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) {
|
||||||
|
bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]);
|
||||||
|
} else if ( ! MathUtils.wellFormedDouble(log10Posteriors[i]) || ! MathUtils.isNegativeOrZero(log10Posteriors[i]) ) {
|
||||||
|
bad = String.format("Posterior %f is badly formed", log10Posteriors[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( bad != null ) {
|
||||||
|
throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch ( IllegalStateException e ) {
|
||||||
|
if ( throwException )
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
else
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Constant static data
|
||||||
|
//
|
||||||
|
    // One log10-likelihood slot per diploid genotype, all 0.0 (log10 of 1);
    // cloned whenever accumulated likelihoods are reset.
    final static double[] zeros = new double[DiploidGenotype.values().length];

    static {
        // redundant (Java zero-initializes doubles) but kept for clarity
        for ( DiploidGenotype g : DiploidGenotype.values() ) {
            zeros[g.ordinal()] = 0.0;
        }
    }
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,257 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
public class DiploidSNPGenotypePriors implements GenotypePriors {
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Constants and static information
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
public static final double HUMAN_HETEROZYGOSITY = 1e-3;
|
||||||
|
public static final double CEU_HETEROZYGOSITY = 1e-3;
|
||||||
|
public static final double YRI_HETEROZYGOSITY = 1.0 / 850;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default value of the prob of seeing a reference error. Used to calculation the
|
||||||
|
* chance of seeing a true B/C het when the reference is A, which we assume is the product
|
||||||
|
* of the ref error rate and the het. Default value is Q60
|
||||||
|
*/
|
||||||
|
public static final double PROB_OF_REFERENCE_ERROR = 1e-6; // the reference is
|
||||||
|
|
||||||
|
private final static double[] flatPriors = new double[DiploidGenotype.values().length];
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Diploid priors
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
private double[] priors = null;
|
||||||
|
|
||||||
|
// todo -- fix me when this issue is resolved
|
||||||
|
public static final boolean requirePriorSumToOne = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new DiploidGenotypePriors object with flat priors for each diploid genotype
|
||||||
|
*/
|
||||||
|
public DiploidSNPGenotypePriors() {
|
||||||
|
priors = flatPriors.clone();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new GenotypeLikelihoods object with priors for a diploid with heterozygosity and reference
|
||||||
|
* base ref
|
||||||
|
*
|
||||||
|
* @param ref
|
||||||
|
* @param heterozygosity
|
||||||
|
* @param probOfTriStateGenotype The prob of seeing a true B/C het when the reference is A
|
||||||
|
*/
|
||||||
|
public DiploidSNPGenotypePriors(byte ref, double heterozygosity, double probOfTriStateGenotype) {
|
||||||
|
priors = getReferencePolarizedPriors(ref, heterozygosity, probOfTriStateGenotype);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new Genotypelike Likelhoods's object with priors (in log10 space) for each of the DiploteGenotypes
|
||||||
|
*
|
||||||
|
* @param log10Priors
|
||||||
|
*/
|
||||||
|
public DiploidSNPGenotypePriors(double[] log10Priors) {
|
||||||
|
priors = log10Priors.clone();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||||
|
*
|
||||||
|
* @return log10 prior as a double array
|
||||||
|
*/
|
||||||
|
public double[] getPriors() {
|
||||||
|
return priors;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the prior associated with DiploidGenotype g
|
||||||
|
* @param g
|
||||||
|
* @return log10 prior as a double
|
||||||
|
*/
|
||||||
|
public double getPrior(DiploidGenotype g) {
|
||||||
|
return getPriors()[g.ordinal()];
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getHeterozygosity() { return HUMAN_HETEROZYGOSITY; }
|
||||||
|
|
||||||
|
public boolean validate(boolean throwException) {
|
||||||
|
try {
|
||||||
|
if ( requirePriorSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(priors), 1.0) != 0 ) {
|
||||||
|
throw new IllegalStateException(String.format("Priors don't sum to 1: sum=%f %s", MathUtils.sumLog10(priors), Arrays.toString(priors)));
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||||
|
int i = g.ordinal();
|
||||||
|
if ( ! MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) {
|
||||||
|
String bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i]));
|
||||||
|
throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch ( IllegalStateException e ) {
|
||||||
|
if ( throwException )
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
else
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Static functionality
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns homozygous-reference, heterozygous, and homozygous-non-ref probabilities given a heterozygosity
|
||||||
|
* value, as elements 0, 1, and 2 of a double[], respectively
|
||||||
|
*
|
||||||
|
* @param h the heterozygosity [probability of a base being heterozygous]
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public static double[] heterozygosity2DiploidProbabilities(double h) {
|
||||||
|
double[] pdbls = new double[3];
|
||||||
|
|
||||||
|
pdbls[0] = heterozygosity2HomRefProbability(h);
|
||||||
|
pdbls[1] = heterozygosity2HetProbability(h);
|
||||||
|
pdbls[2] = heterozygosity2HomVarProbability(h);
|
||||||
|
return pdbls;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param h
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public static double heterozygosity2HomRefProbability(double h) {
|
||||||
|
if (MathUtils.isNegative(h)) {
|
||||||
|
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||||
|
}
|
||||||
|
|
||||||
|
double v = 1.0 - (3.0 * h / 2.0);
|
||||||
|
if (MathUtils.isNegative(v)) {
|
||||||
|
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||||
|
}
|
||||||
|
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static double heterozygosity2HetProbability(double h) {
|
||||||
|
if (MathUtils.isNegative(h)) {
|
||||||
|
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||||
|
}
|
||||||
|
|
||||||
|
return h;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static double heterozygosity2HomVarProbability(double h) {
|
||||||
|
if (MathUtils.isNegative(h)) {
|
||||||
|
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||||
|
}
|
||||||
|
|
||||||
|
return h / 2.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector
|
||||||
|
* appropriately.
|
||||||
|
*
|
||||||
|
* Suppose A is the reference base, and we are given the probability of being hom-ref, het, and hom-var,
|
||||||
|
* and that pTriSateGenotype is the true probability of observing reference A and a true genotype of B/C
|
||||||
|
* then this sets the priors to:
|
||||||
|
*
|
||||||
|
* AA = hom-ref
|
||||||
|
* AC = AG = AT = (het - pTriStateGenotype) / 3
|
||||||
|
* CC = GG = TT = hom-var / 3
|
||||||
|
* CG = CT = GT = pTriStateGenotype / 3
|
||||||
|
*
|
||||||
|
* So that we get:
|
||||||
|
*
|
||||||
|
* hom-ref + 3 * (het - pTriStateGenotype) / 3 + 3 * hom-var / 3 + 3 * pTriStateGenotype
|
||||||
|
* hom-ref + het - pTriStateGenotype + hom-var + pTriStateGenotype
|
||||||
|
* hom-ref + het + hom-var
|
||||||
|
* = 1
|
||||||
|
*
|
||||||
|
* @param ref
|
||||||
|
* @param heterozyosity
|
||||||
|
* @param pRefError
|
||||||
|
*/
|
||||||
|
public static double[] getReferencePolarizedPriors(byte ref, double heterozyosity, double pRefError ) {
|
||||||
|
if ( ! MathUtils.isBounded(pRefError, 0.0, 0.01) ) {
|
||||||
|
throw new RuntimeException(String.format("BUG: p Reference error is out of bounds (0.0 - 0.01) is allow range %f", pRefError));
|
||||||
|
}
|
||||||
|
|
||||||
|
double pTriStateGenotype = heterozyosity * pRefError;
|
||||||
|
// if ( pTriStateGenotype >= heterozyosity ) {
|
||||||
|
// throw new RuntimeException(String.format("p Tristate genotype %f is greater than the heterozygosity %f", pTriStateGenotype, heterozyosity));
|
||||||
|
// }
|
||||||
|
|
||||||
|
double pHomRef = heterozygosity2HomRefProbability(heterozyosity);
|
||||||
|
double pHet = heterozygosity2HetProbability(heterozyosity);
|
||||||
|
double pHomVar = heterozygosity2HomVarProbability(heterozyosity);
|
||||||
|
|
||||||
|
if (MathUtils.compareDoubles(pHomRef + pHet + pHomVar, 1.0) != 0) {
|
||||||
|
throw new RuntimeException(String.format("BUG: Prior probabilities don't sum to one => %f, %f, %f", pHomRef, pHet, pHomVar));
|
||||||
|
}
|
||||||
|
|
||||||
|
double[] priors = new double[DiploidGenotype.values().length];
|
||||||
|
|
||||||
|
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||||
|
double POfG;
|
||||||
|
|
||||||
|
final double nOnRefHets = 3;
|
||||||
|
final double nOffRefHets = 3;
|
||||||
|
final double nHomVars = 3;
|
||||||
|
|
||||||
|
if ( g.isHomRef(ref) ) { POfG = pHomRef; }
|
||||||
|
else if ( g.isHomVar(ref) ) { POfG = pHomVar / nHomVars; }
|
||||||
|
else if ( g.isHetRef(ref) ) { POfG = (pHet - pTriStateGenotype ) / nOnRefHets; }
|
||||||
|
else { POfG = pTriStateGenotype / nOffRefHets; }
|
||||||
|
|
||||||
|
priors[g.ordinal()] = Math.log10(POfG);
|
||||||
|
}
|
||||||
|
|
||||||
|
return priors;
|
||||||
|
}
|
||||||
|
|
||||||
|
static {
|
||||||
|
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||||
|
flatPriors[g.ordinal()] = Math.log10(1.0 / DiploidGenotype.values().length);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,301 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
|
||||||
|
import static java.lang.Math.log10;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
import java.util.EnumMap;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.SAMReadGroupRecord;
|
||||||
|
|
||||||
|
public class EmpiricalSubstitutionProbabilities extends FourBaseLikelihoods {
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Static methods to manipulate machine platforms
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
    // Supported sequencing technologies.  NOTE(review): ordinal() is used as
    // an index into the static genotype-likelihood cache, so do not reorder
    // these constants.
    public enum SequencerPlatform {
        SOLEXA,   // Solexa / Illumina
        ROCHE454, // 454
        SOLID,    // SOLiD
        CG,       // Complete Genomics
        UNKNOWN   // No idea -- defaulting to 1/3
    }
|
||||||
|
|
||||||
|
private static TreeMap<String, SequencerPlatform> PLFieldToSequencerPlatform = new TreeMap<String, SequencerPlatform>();
|
||||||
|
private static void bind(String s, SequencerPlatform x) {
|
||||||
|
PLFieldToSequencerPlatform.put(s, x);
|
||||||
|
PLFieldToSequencerPlatform.put(s.toUpperCase(), x);
|
||||||
|
PLFieldToSequencerPlatform.put(s.toLowerCase(), x);
|
||||||
|
}
|
||||||
|
|
||||||
|
    //
    // Static list of platforms supported by this system, keyed by the strings
    // that appear in a read group's PL field.
    //
    static {
        bind("LS454", SequencerPlatform.ROCHE454);
        bind("454", SequencerPlatform.ROCHE454);
        bind("ILLUMINA", SequencerPlatform.SOLEXA);
        bind("SOLEXA", SequencerPlatform.SOLEXA);
        bind("SOLID", SequencerPlatform.SOLID);
        bind("ABI_SOLID", SequencerPlatform.SOLID);
        bind("CG", SequencerPlatform.CG);
    }
|
||||||
|
|
||||||
|
public static SequencerPlatform standardizeSequencerPlatform( final String sequencerString ) {
|
||||||
|
String lcSequencerString = sequencerString == null ? null : sequencerString.toLowerCase();
|
||||||
|
if ( sequencerString != null && PLFieldToSequencerPlatform.containsKey(lcSequencerString) ) {
|
||||||
|
return PLFieldToSequencerPlatform.get(lcSequencerString);
|
||||||
|
} else {
|
||||||
|
return SequencerPlatform.UNKNOWN;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static ThreadLocal<SAMRecord> lastReadForPL = new ThreadLocal<SAMRecord>();
|
||||||
|
private static ThreadLocal<SequencerPlatform> plOfLastRead = new ThreadLocal<SequencerPlatform>();
|
||||||
|
public static SequencerPlatform getReadSequencerPlatform( SAMRecord read ) {
|
||||||
|
if ( lastReadForPL.get() != read ) {
|
||||||
|
lastReadForPL.set(read);
|
||||||
|
SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||||
|
final String platformName = readGroup == null ? null : readGroup.getPlatform();
|
||||||
|
plOfLastRead.set(standardizeSequencerPlatform(platformName));
|
||||||
|
}
|
||||||
|
|
||||||
|
return plOfLastRead.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getReadSequencerPlatformIndex( SAMRecord read ) {
|
||||||
|
return getReadSequencerPlatform(read).ordinal();
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Static methods to get at the transition tables themselves
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A matrix of value i x j -> log10(p) where
|
||||||
|
*
|
||||||
|
* i - byte of the miscalled base (i.e., A)
|
||||||
|
* j - byte of the presumed true base (i.e., C)
|
||||||
|
* log10p - empirical probability p that A is actually C
|
||||||
|
*
|
||||||
|
* The table is available for each technology
|
||||||
|
*/
|
||||||
|
private final static EnumMap<SequencerPlatform, double[][]> log10pTrueGivenMiscall = new EnumMap<SequencerPlatform, double[][]>(SequencerPlatform.class);
|
||||||
|
|
||||||
|
private static void addMisCall(final SequencerPlatform pl, byte miscalledBase, byte trueBase, double p) {
|
||||||
|
if ( ! log10pTrueGivenMiscall.containsKey(pl) )
|
||||||
|
log10pTrueGivenMiscall.put(pl, new double[4][4]);
|
||||||
|
|
||||||
|
double[][] misCallProbs = log10pTrueGivenMiscall.get(pl);
|
||||||
|
int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase);
|
||||||
|
int j = BaseUtils.simpleBaseToBaseIndex(trueBase);
|
||||||
|
misCallProbs[i][j] = log10(p);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static double getProbMiscallIsBase(SequencerPlatform pl, byte miscalledBase, byte trueBase) {
|
||||||
|
int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase);
|
||||||
|
int j = BaseUtils.simpleBaseToBaseIndex(trueBase);
|
||||||
|
|
||||||
|
double logP = log10pTrueGivenMiscall.get(pl)[i][j];
|
||||||
|
if ( logP == 0.0 )
|
||||||
|
throw new RuntimeException(String.format("Bad miscall base request miscalled=%c true=%c", miscalledBase, trueBase));
|
||||||
|
else
|
||||||
|
return logP;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void addSolexa() {
|
||||||
|
SequencerPlatform pl = SequencerPlatform.SOLEXA;
|
||||||
|
addMisCall(pl, BaseUtils.A, BaseUtils.C, 57.7/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.A, BaseUtils.G, 17.1/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.A, BaseUtils.T, 25.2/100.0);
|
||||||
|
|
||||||
|
addMisCall(pl, BaseUtils.C, BaseUtils.A, 34.9/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.C, BaseUtils.G, 11.3/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.C, BaseUtils.T, 53.9/100.0);
|
||||||
|
|
||||||
|
addMisCall(pl, BaseUtils.G, BaseUtils.A, 31.9/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.G, BaseUtils.C, 5.1/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.G, BaseUtils.T, 63.0/100.0);
|
||||||
|
|
||||||
|
addMisCall(pl, BaseUtils.T, BaseUtils.A, 45.8/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.T, BaseUtils.C, 22.1/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.T, BaseUtils.G, 32.0/100.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void addSOLiD() {
|
||||||
|
SequencerPlatform pl = SequencerPlatform.SOLID;
|
||||||
|
addMisCall(pl, BaseUtils.A, BaseUtils.C, 18.7/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.A, BaseUtils.G, 42.5/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.A, BaseUtils.T, 38.7/100.0);
|
||||||
|
|
||||||
|
addMisCall(pl, BaseUtils.C, BaseUtils.A, 27.0/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.C, BaseUtils.G, 18.9/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.C, BaseUtils.T, 54.1/100.0);
|
||||||
|
|
||||||
|
addMisCall(pl, BaseUtils.G, BaseUtils.A, 61.0/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.G, BaseUtils.C, 15.7/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.G, BaseUtils.T, 23.2/100.0);
|
||||||
|
|
||||||
|
addMisCall(pl, BaseUtils.T, BaseUtils.A, 40.5/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.T, BaseUtils.C, 34.3/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.T, BaseUtils.G, 25.2/100.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void add454() {
|
||||||
|
SequencerPlatform pl = SequencerPlatform.ROCHE454;
|
||||||
|
addMisCall(pl, BaseUtils.A, BaseUtils.C, 23.2/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.A, BaseUtils.G, 42.6/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.A, BaseUtils.T, 34.3/100.0);
|
||||||
|
|
||||||
|
addMisCall(pl, BaseUtils.C, BaseUtils.A, 19.7/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.C, BaseUtils.G, 8.4/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.C, BaseUtils.T, 71.9/100.0);
|
||||||
|
|
||||||
|
addMisCall(pl, BaseUtils.G, BaseUtils.A, 71.5/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.G, BaseUtils.C, 6.6/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.G, BaseUtils.T, 21.9/100.0);
|
||||||
|
|
||||||
|
addMisCall(pl, BaseUtils.T, BaseUtils.A, 43.8/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.T, BaseUtils.C, 37.8/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.T, BaseUtils.G, 18.5/100.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void addCG() {
|
||||||
|
SequencerPlatform pl = SequencerPlatform.CG;
|
||||||
|
addMisCall(pl, BaseUtils.A, BaseUtils.C, 28.2/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.A, BaseUtils.G, 28.7/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.A, BaseUtils.T, 43.1/100.0);
|
||||||
|
|
||||||
|
addMisCall(pl, BaseUtils.C, BaseUtils.A, 29.8/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.C, BaseUtils.G, 18.6/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.C, BaseUtils.T, 51.6/100.0);
|
||||||
|
|
||||||
|
addMisCall(pl, BaseUtils.G, BaseUtils.A, 32.5/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.G, BaseUtils.C, 21.4/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.G, BaseUtils.T, 46.1/100.0);
|
||||||
|
|
||||||
|
addMisCall(pl, BaseUtils.T, BaseUtils.A, 42.6/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.T, BaseUtils.C, 34.1/100.0);
|
||||||
|
addMisCall(pl, BaseUtils.T, BaseUtils.G, 23.3/100.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void addUnknown() {
|
||||||
|
SequencerPlatform pl = SequencerPlatform.UNKNOWN;
|
||||||
|
for ( byte b1 : BaseUtils.BASES ) {
|
||||||
|
for ( byte b2 : BaseUtils.BASES ) {
|
||||||
|
if ( b1 != b2 )
|
||||||
|
addMisCall(pl, b1, b2, 1.0/3.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static {
|
||||||
|
addSolexa();
|
||||||
|
add454();
|
||||||
|
addSOLiD();
|
||||||
|
addCG();
|
||||||
|
addUnknown();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// The actual objects themselves
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
private boolean raiseErrorOnUnknownPlatform = true;
|
||||||
|
private SequencerPlatform defaultPlatform = SequencerPlatform.UNKNOWN;
|
||||||
|
|
||||||
|
//
|
||||||
|
// forwarding constructors -- don't do anything at all
|
||||||
|
//
|
||||||
|
public EmpiricalSubstitutionProbabilities() { super(); }
|
||||||
|
|
||||||
|
public EmpiricalSubstitutionProbabilities(boolean raiseErrorOnUnknownPlatform) {
|
||||||
|
super();
|
||||||
|
this.raiseErrorOnUnknownPlatform = raiseErrorOnUnknownPlatform;
|
||||||
|
}
|
||||||
|
|
||||||
|
public EmpiricalSubstitutionProbabilities(SequencerPlatform assumeUnknownPlatformsAreThis) {
|
||||||
|
super();
|
||||||
|
|
||||||
|
if ( assumeUnknownPlatformsAreThis != null ) {
|
||||||
|
raiseErrorOnUnknownPlatform = false;
|
||||||
|
defaultPlatform = assumeUnknownPlatformsAreThis;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cloning of the object
|
||||||
|
* @return clone
|
||||||
|
* @throws CloneNotSupportedException
|
||||||
|
*/
|
||||||
|
protected Object clone() throws CloneNotSupportedException {
|
||||||
|
return super.clone();
|
||||||
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// calculation of p(B|GT)
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// -----------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
|
||||||
|
boolean fwdStrand = ! read.getReadNegativeStrandFlag();
|
||||||
|
SequencerPlatform pl = getReadSequencerPlatform(read);
|
||||||
|
|
||||||
|
if ( pl == SequencerPlatform.UNKNOWN ) {
|
||||||
|
if ( raiseErrorOnUnknownPlatform )
|
||||||
|
throw new RuntimeException("Unknown sequencer platform for read " + read.format() + "; your BAM file is either missing the PL tag for some read groups or an unsupported platform is being used.");
|
||||||
|
else {
|
||||||
|
pl = defaultPlatform;
|
||||||
|
//System.out.printf("Using default platform %s", pl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//System.out.printf("%s for %s%n", pl, read);
|
||||||
|
|
||||||
|
double log10p;
|
||||||
|
if ( fwdStrand ) {
|
||||||
|
log10p = getProbMiscallIsBase(pl, observedBase, chromBase);
|
||||||
|
} else {
|
||||||
|
log10p = getProbMiscallIsBase(pl, BaseUtils.simpleComplement(observedBase), BaseUtils.simpleComplement(chromBase));
|
||||||
|
}
|
||||||
|
|
||||||
|
//System.out.printf("p = %f for %s %c %c fwd=%b %d at %s%n", pow(10,log10p), pl, observedBase, chromBase, fwdStrand, offset, read.getReadName() );
|
||||||
|
|
||||||
|
return log10p;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,50 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
|
||||||
|
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
|
|
||||||
|
protected ExactAFCalculationModel(int N, Logger logger, PrintStream verboseWriter) {
|
||||||
|
super(N, logger, verboseWriter);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void getLog10PNonRef(RefMetaDataTracker tracker,
|
||||||
|
ReferenceContext ref,
|
||||||
|
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
||||||
|
double[] log10AlleleFrequencyPriors,
|
||||||
|
double[] log10AlleleFrequencyPosteriors) {
|
||||||
|
|
||||||
|
// TODO: implement me based on
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,373 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
|
||||||
|
import static java.lang.Math.log10;
|
||||||
|
import static java.lang.Math.pow;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stable, error checking version of strict 4 base likelihoods. Useful for calculating the likelihoods, priors,
|
||||||
|
* and posteriors given a pile of bases and quality scores (in conjuncion with GenotypeLikelihoods)
|
||||||
|
*
|
||||||
|
* Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object
|
||||||
|
* calculates:
|
||||||
|
*
|
||||||
|
* P(b | D) = P(b) * P(D | b)
|
||||||
|
*
|
||||||
|
* where
|
||||||
|
*
|
||||||
|
* P(D | b) = sum_i log10 P(bi | b)
|
||||||
|
*
|
||||||
|
* and
|
||||||
|
*
|
||||||
|
* P(bi | b) = 1 - P(error | q1) if bi = b
|
||||||
|
* = P(error | q1) / 3 if bi != b
|
||||||
|
*
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public abstract class FourBaseLikelihoods implements Cloneable {
|
||||||
|
|
||||||
|
protected boolean enableCacheFlag = true;
|
||||||
|
|
||||||
|
//
|
||||||
|
// The fundamental data array associated with 4-base likelihoods
|
||||||
|
//
|
||||||
|
protected double[] log10Likelihoods = null;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If true, lots of output will be generated about the Likelihoods at each site
|
||||||
|
*/
|
||||||
|
private boolean verbose = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Bases with Q scores below this threshold aren't included in the Likelihood calculation
|
||||||
|
*/
|
||||||
|
private int minQScoreToInclude = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new FourBaseLikelihoods object
|
||||||
|
*/
|
||||||
|
public FourBaseLikelihoods() {
|
||||||
|
log10Likelihoods = zeros.clone(); // Likelihoods are all zeros
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cloning of the object
|
||||||
|
* @return clone
|
||||||
|
* @throws CloneNotSupportedException
|
||||||
|
*/
|
||||||
|
protected Object clone() throws CloneNotSupportedException {
|
||||||
|
FourBaseLikelihoods c = (FourBaseLikelihoods)super.clone();
|
||||||
|
c.log10Likelihoods = log10Likelihoods.clone();
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setVerbose(boolean v) {
|
||||||
|
verbose = v;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isVerbose() {
|
||||||
|
return verbose;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getMinQScoreToInclude() {
|
||||||
|
return minQScoreToInclude;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMinQScoreToInclude(int minQScoreToInclude) {
|
||||||
|
this.minQScoreToInclude = minQScoreToInclude;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an array of log10 likelihoods for each base, indexed by BaseUtils.BASES.ordinal values()
|
||||||
|
* @return probs
|
||||||
|
*/
|
||||||
|
public double[] getLog10Likelihoods() {
|
||||||
|
return log10Likelihoods;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the likelihood associated with a base
|
||||||
|
* @param base base
|
||||||
|
* @return log10 likelihood as a double
|
||||||
|
*/
|
||||||
|
public double getLog10Likelihood(byte base) {
|
||||||
|
return getLog10Likelihood(BaseUtils.simpleBaseToBaseIndex(base));
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getLog10Likelihood(int baseIndex) {
|
||||||
|
return (baseIndex < 0 ? 0.0 : getLog10Likelihoods()[baseIndex]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an array of likelihoods for each base, indexed by BaseUtils.BASES.ordinal values()
|
||||||
|
* @return probs
|
||||||
|
*/
|
||||||
|
public double[] getLikelihoods() {
|
||||||
|
double[] probs = new double[4];
|
||||||
|
for (int i = 0; i < 4; i++)
|
||||||
|
probs[i] = Math.pow(10, log10Likelihoods[i]);
|
||||||
|
return probs;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the likelihoods associated with a base
|
||||||
|
* @param base base
|
||||||
|
* @return likelihoods as a double
|
||||||
|
*/
|
||||||
|
public double getLikelihood(byte base) {
|
||||||
|
return getLikelihood(BaseUtils.simpleBaseToBaseIndex(base));
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getLikelihood(int baseIndex) {
|
||||||
|
return (baseIndex < 0 ? 0.0 : Math.pow(10, log10Likelihoods[baseIndex]));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// add() -- the heart of
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// -----------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Updates likelihoods and posteriors to reflect an additional observation of observedBase with
|
||||||
|
* qualityScore.
|
||||||
|
*
|
||||||
|
* @param observedBase observed base
|
||||||
|
* @param qualityScore base quality
|
||||||
|
* @param read SAM read
|
||||||
|
* @param offset offset on read
|
||||||
|
* @return 1 if the base was considered good enough to add to the likelihoods (not Q0 or 'N', for example)
|
||||||
|
*/
|
||||||
|
public int add(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
|
||||||
|
FourBaseLikelihoods fbl = computeLog10Likelihoods(observedBase, qualityScore, read, offset);
|
||||||
|
if ( fbl == null )
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
for ( byte base : BaseUtils.BASES ) {
|
||||||
|
double likelihood = fbl.getLikelihood(base);
|
||||||
|
log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)] += likelihood;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( verbose ) {
|
||||||
|
for ( byte base : BaseUtils.BASES ) { System.out.printf("%s\t", (char)base); }
|
||||||
|
System.out.println();
|
||||||
|
for ( byte base : BaseUtils.BASES ) { System.out.printf("%.2f\t", log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)]); }
|
||||||
|
System.out.println();
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Updates likelihoods and posteriors to reflect an additional observation of observedBase with
|
||||||
|
* qualityScore.
|
||||||
|
*
|
||||||
|
* @param observedBase observed base
|
||||||
|
* @param qualityScore base quality
|
||||||
|
* @param read SAM read
|
||||||
|
* @param offset offset on read
|
||||||
|
* @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example)
|
||||||
|
*/
|
||||||
|
public FourBaseLikelihoods computeLog10Likelihoods(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
|
||||||
|
if ( badBase(observedBase) ) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
if ( qualityScore > getMinQScoreToInclude() ) {
|
||||||
|
|
||||||
|
FourBaseLikelihoods fbl = (FourBaseLikelihoods)this.clone();
|
||||||
|
fbl.log10Likelihoods = zeros.clone();
|
||||||
|
|
||||||
|
for ( byte base : BaseUtils.BASES ) {
|
||||||
|
double likelihood = log10PofObservingBaseGivenChromosome(observedBase, base, qualityScore, read, offset);
|
||||||
|
|
||||||
|
if ( verbose ) {
|
||||||
|
boolean fwdStrand = ! read.getReadNegativeStrandFlag();
|
||||||
|
System.out.printf(" L(%c | b=%s, Q=%d, S=%s) = %f / %f%n",
|
||||||
|
observedBase, base, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood);
|
||||||
|
}
|
||||||
|
|
||||||
|
fbl.log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)] = likelihood;
|
||||||
|
}
|
||||||
|
|
||||||
|
return fbl;
|
||||||
|
}
|
||||||
|
} catch ( CloneNotSupportedException e ) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// helper routines
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// -----------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns true when the observedBase is considered bad and shouldn't be processed by this object. A base
|
||||||
|
* is considered bad if:
|
||||||
|
*
|
||||||
|
* Criterion 1: observed base isn't a A,C,T,G or lower case equivalent
|
||||||
|
*
|
||||||
|
* @param observedBase observed base
|
||||||
|
* @return true if the base is a bad base
|
||||||
|
*/
|
||||||
|
private boolean badBase(byte observedBase) {
|
||||||
|
return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return a string representation of this object in a moderately usable form
|
||||||
|
*
|
||||||
|
* @return string representation
|
||||||
|
*/
|
||||||
|
public String toString() {
|
||||||
|
double sum = 0;
|
||||||
|
StringBuilder s = new StringBuilder();
|
||||||
|
for ( byte base : BaseUtils.BASES ) {
|
||||||
|
int baseIndex = BaseUtils.simpleBaseToBaseIndex(base);
|
||||||
|
s.append(String.format("%s %.10f ", base, log10Likelihoods[baseIndex]));
|
||||||
|
sum += Math.pow(10, log10Likelihoods[baseIndex]);
|
||||||
|
}
|
||||||
|
s.append(String.format(" %f", sum));
|
||||||
|
return s.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
// in general, we don't care about the platform index; EmpiricalSubstitutionProbabilities overloads this
|
||||||
|
public int getReadSequencerPlatformIndex( SAMRecord read ) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// Validation routines
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// -----------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
public boolean validate() {
|
||||||
|
return validate(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean validate(boolean throwException) {
|
||||||
|
try {
|
||||||
|
|
||||||
|
for ( byte base : BaseUtils.BASES ) {
|
||||||
|
|
||||||
|
int i = BaseUtils.simpleBaseToBaseIndex(base);
|
||||||
|
if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) {
|
||||||
|
String bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]);
|
||||||
|
throw new IllegalStateException(String.format("At %s: %s", base, bad));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch ( IllegalStateException e ) {
|
||||||
|
if ( throwException )
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
else
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// Hearty math calculations follow
|
||||||
|
//
|
||||||
|
// -- these should not be messed with unless you know what you are doing
|
||||||
|
//
|
||||||
|
// -----------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param observedBase observed base
|
||||||
|
* @param chromBase target base
|
||||||
|
* @param qual base quality
|
||||||
|
* @param read SAM read
|
||||||
|
* @param offset offset on read
|
||||||
|
* @return log10 likelihood
|
||||||
|
*/
|
||||||
|
|
||||||
|
protected double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual, SAMRecord read, int offset) {
|
||||||
|
if (qual == 0) { // zero quals are wrong
|
||||||
|
throw new RuntimeException(String.format("Unexpected Q0 base discovered in log10PofObservingBaseGivenChromosome: %c %s %d at %d in %s",
|
||||||
|
observedBase, chromBase, qual, offset, read));
|
||||||
|
}
|
||||||
|
|
||||||
|
double logP;
|
||||||
|
|
||||||
|
if ( observedBase == chromBase ) {
|
||||||
|
// the base is consistent with the chromosome -- it's 1 - e
|
||||||
|
//logP = oneMinusData[qual];
|
||||||
|
double e = pow(10, (qual / -10.0));
|
||||||
|
logP = log10(1.0 - e);
|
||||||
|
} else {
|
||||||
|
// the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error)
|
||||||
|
logP = qual / -10.0 + log10PofTrueBaseGivenMiscall(observedBase, chromBase, read, offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
//System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP);
|
||||||
|
return logP;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Must be overridden by concrete subclasses
|
||||||
|
*
|
||||||
|
* @param observedBase observed base
|
||||||
|
* @param chromBase target base
|
||||||
|
* @param read SAM read
|
||||||
|
* @param offset offset on read
|
||||||
|
* @return log10 likelihood
|
||||||
|
*/
|
||||||
|
protected abstract double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Constant static data
|
||||||
|
//
|
||||||
|
private final static double[] zeros = new double[BaseUtils.BASES.length];
|
||||||
|
|
||||||
|
static {
|
||||||
|
for ( byte base : BaseUtils.BASES ) {
|
||||||
|
zeros[BaseUtils.simpleBaseToBaseIndex(base)] = 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,64 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import static org.broadinstitute.sting.playground.gatk.walkers.genotyper.BaseMismatchModel.*;
|
||||||
|
|
||||||
|
public class FourBaseLikelihoodsFactory {
|
||||||
|
//private FourBaseProbabilitiesFactory() {} // cannot be instantiated
|
||||||
|
|
||||||
|
public static BaseMismatchModel getBaseMismatchModel(final String name) {
|
||||||
|
return valueOf(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static BaseMismatchModel getBaseMismatchModel(final FourBaseLikelihoods m) {
|
||||||
|
if ( m instanceof OneStateErrorProbabilities)
|
||||||
|
return ONE_STATE;
|
||||||
|
else if ( m instanceof ThreeStateErrorProbabilities)
|
||||||
|
return THREE_STATE;
|
||||||
|
else if ( m instanceof EmpiricalSubstitutionProbabilities)
|
||||||
|
return EMPIRICAL;
|
||||||
|
|
||||||
|
throw new RuntimeException("Unexpected BaseMismatchModel " + m.getClass());
|
||||||
|
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* General and correct way to create FourBaseLikelihood objects for arbitrary base mismatching models
|
||||||
|
*
|
||||||
|
* @param m model
|
||||||
|
* @param pl default platform
|
||||||
|
* @return new 4-base model
|
||||||
|
*/
|
||||||
|
public static FourBaseLikelihoods makeFourBaseLikelihoods(BaseMismatchModel m,
|
||||||
|
EmpiricalSubstitutionProbabilities.SequencerPlatform pl ) {
|
||||||
|
switch ( m ) {
|
||||||
|
case ONE_STATE: return new OneStateErrorProbabilities();
|
||||||
|
case THREE_STATE: return new ThreeStateErrorProbabilities();
|
||||||
|
case EMPIRICAL: return new EmpiricalSubstitutionProbabilities(pl);
|
||||||
|
default: throw new RuntimeException("Unexpected BaseMismatchModel " + m);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,77 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broad.tribble.util.variantcontext.Allele;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The model representing how we calculate genotype likelihoods
|
||||||
|
*/
|
||||||
|
public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
|
||||||
|
|
||||||
|
public enum Model {
|
||||||
|
SNP,
|
||||||
|
DINDEL
|
||||||
|
}
|
||||||
|
|
||||||
|
protected UnifiedArgumentCollection UAC;
|
||||||
|
protected Logger logger;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new object
|
||||||
|
* @param logger logger
|
||||||
|
* @param UAC unified arg collection
|
||||||
|
*/
|
||||||
|
protected GenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||||
|
this.UAC = UAC.clone();
|
||||||
|
this.logger = logger;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Must be overridden by concrete subclasses
|
||||||
|
* @param tracker rod data
|
||||||
|
* @param ref reference context
|
||||||
|
* @param contexts stratified alignment contexts
|
||||||
|
* @param contextType stratified context type
|
||||||
|
* @param priors priors to use for GLs
|
||||||
|
* @param GLs hash of sample->GL to fill in
|
||||||
|
*
|
||||||
|
* @return genotype likelihoods per sample for AA, AB, BB
|
||||||
|
*/
|
||||||
|
public abstract Allele getLikelihoods(RefMetaDataTracker tracker,
|
||||||
|
ReferenceContext ref,
|
||||||
|
Map<String, StratifiedAlignmentContext> contexts,
|
||||||
|
StratifiedAlignmentContext.StratifiedContextType contextType,
|
||||||
|
GenotypePriors priors,
|
||||||
|
Map<String, BiallelicGenotypeLikelihoods> GLs);
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,35 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
/**
 * Contract for prior-probability providers used by the genotyping models.
 */
public interface GenotypePriors {

    /** @return the array of prior probabilities */
    double[] getPriors();

    /** @return the heterozygosity value underlying these priors */
    double getHeterozygosity();

    /**
     * Check internal consistency of the priors.
     *
     * @param throwException if true, signal problems by throwing rather than returning false
     * @return true when the priors are valid
     */
    boolean validate(boolean throwException);
}
|
||||||
|
|
@ -0,0 +1,97 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broad.tribble.util.variantcontext.Genotype;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
|
||||||
|
public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel {
|
||||||
|
|
||||||
|
// for use in optimizing the P(D|AF) calculations:
|
||||||
|
// how much off from the max likelihoods do we need to be before we can quit calculating?
|
||||||
|
protected static final double LOG10_OPTIMIZATION_EPSILON = 8.0;
|
||||||
|
|
||||||
|
protected GridSearchAFEstimation(int N, Logger logger, PrintStream verboseWriter) {
|
||||||
|
super(N, logger, verboseWriter);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void getLog10PNonRef(RefMetaDataTracker tracker,
|
||||||
|
ReferenceContext ref,
|
||||||
|
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
||||||
|
double[] log10AlleleFrequencyPriors,
|
||||||
|
double[] log10AlleleFrequencyPosteriors) {
|
||||||
|
|
||||||
|
initializeAFMatrix(GLs);
|
||||||
|
|
||||||
|
// first, calculate for AF=0 (no change to matrix)
|
||||||
|
log10AlleleFrequencyPosteriors[0] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[0];
|
||||||
|
double maxLikelihoodSeen = log10AlleleFrequencyPosteriors[0];
|
||||||
|
|
||||||
|
// TODO: get rid of this optimization, it is wrong!
|
||||||
|
int minAlleleFrequencyToTest = getMinAlleleFrequencyToTest();
|
||||||
|
|
||||||
|
int maxAlleleFrequencyToTest = AFMatrix.getSamples().size() * 2;
|
||||||
|
// for each minor allele frequency, calculate log10PofDgivenAFi
|
||||||
|
for (int i = 1; i <= maxAlleleFrequencyToTest; i++) {
|
||||||
|
// add one more alternate allele
|
||||||
|
AFMatrix.incrementFrequency();
|
||||||
|
|
||||||
|
// calculate new likelihoods
|
||||||
|
log10AlleleFrequencyPosteriors[i] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[i];
|
||||||
|
|
||||||
|
// an optimization to speed up the calculation: if we are beyond the local maximum such
|
||||||
|
// that subsequent likelihoods won't factor into the confidence score, just quit
|
||||||
|
if ( i >= minAlleleFrequencyToTest && maxLikelihoodSeen - log10AlleleFrequencyPosteriors[i] > LOG10_OPTIMIZATION_EPSILON ) {
|
||||||
|
UnifiedGenotyperEngine.ignoreAlleleFrequenciesAboveI(log10AlleleFrequencyPosteriors, i);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( log10AlleleFrequencyPosteriors[i] > maxLikelihoodSeen )
|
||||||
|
maxLikelihoodSeen = log10AlleleFrequencyPosteriors[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Overrides the super class
|
||||||
|
* @param contexts alignment contexts
|
||||||
|
* @param GLs genotype likelihoods
|
||||||
|
* @param log10AlleleFrequencyPosteriors allele frequency results
|
||||||
|
*
|
||||||
|
* @return calls
|
||||||
|
*/
|
||||||
|
public Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
|
||||||
|
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
||||||
|
double[] log10AlleleFrequencyPosteriors,
|
||||||
|
int AFofMaxLikelihood) {
|
||||||
|
return generateCalls(contexts, GLs, AFofMaxLikelihood);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,60 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This implements the old style CRD calculation of the chance that a base being a true chromBase given
|
||||||
|
* an miscalled base, in which the p is e, grabbing all of the probability. It shouldn't be used
|
||||||
|
*/
|
||||||
|
public class OneStateErrorProbabilities extends FourBaseLikelihoods {
|
||||||
|
//
|
||||||
|
// forwarding constructors -- don't do anything at all
|
||||||
|
//
|
||||||
|
public OneStateErrorProbabilities() { super(); }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cloning of the object
|
||||||
|
* @return clone
|
||||||
|
* @throws CloneNotSupportedException
|
||||||
|
*/
|
||||||
|
protected Object clone() throws CloneNotSupportedException {
|
||||||
|
return super.clone();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param observedBase observed base
|
||||||
|
* @param chromBase target base
|
||||||
|
* @param read SAM read
|
||||||
|
* @param offset offset on read
|
||||||
|
* @return log10 likelihood
|
||||||
|
*/
|
||||||
|
protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
|
||||||
|
return 0; // equivalent to e model
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,138 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.DiploidGenotype;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broad.tribble.util.variantcontext.Allele;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
|
||||||
|
|
||||||
|
// the alternate allele with the largest sum of quality scores
|
||||||
|
protected Byte bestAlternateAllele = null;
|
||||||
|
|
||||||
|
protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||||
|
super(UAC, logger);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Allele getLikelihoods(RefMetaDataTracker tracker,
|
||||||
|
ReferenceContext ref,
|
||||||
|
Map<String, StratifiedAlignmentContext> contexts,
|
||||||
|
StratifiedAlignmentContext.StratifiedContextType contextType,
|
||||||
|
GenotypePriors priors,
|
||||||
|
Map<String, BiallelicGenotypeLikelihoods> GLs) {
|
||||||
|
|
||||||
|
if ( !(priors instanceof DiploidSNPGenotypePriors) )
|
||||||
|
throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model");
|
||||||
|
|
||||||
|
byte refBase = ref.getBase();
|
||||||
|
Allele refAllele = Allele.create(refBase, true);
|
||||||
|
|
||||||
|
// find the alternate allele with the largest sum of quality scores
|
||||||
|
if ( contextType == StratifiedAlignmentContext.StratifiedContextType.COMPLETE )
|
||||||
|
initializeBestAlternateAllele(refBase, contexts);
|
||||||
|
|
||||||
|
// if there are no non-ref bases...
|
||||||
|
if ( bestAlternateAllele == null ) {
|
||||||
|
// did we trigger on the provided track?
|
||||||
|
boolean atTriggerTrack = tracker.getReferenceMetaData(UnifiedGenotyperEngine.TRIGGER_TRACK_NAME, false).size() > 0;
|
||||||
|
|
||||||
|
// if we don't want all bases, then we don't need to calculate genotype likelihoods
|
||||||
|
if ( !atTriggerTrack && !UAC.ALL_BASES_MODE && !UAC.GENOTYPE_MODE )
|
||||||
|
return refAllele;
|
||||||
|
|
||||||
|
// otherwise, choose any alternate allele (it doesn't really matter)
|
||||||
|
bestAlternateAllele = (byte)(refBase != 'A' ? 'A' : 'C');
|
||||||
|
}
|
||||||
|
|
||||||
|
Allele altAllele = Allele.create(bestAlternateAllele, false);
|
||||||
|
|
||||||
|
for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
|
||||||
|
ReadBackedPileup pileup = sample.getValue().getContext(contextType).getBasePileup();
|
||||||
|
|
||||||
|
// create the GenotypeLikelihoods object
|
||||||
|
DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.baseModel, (DiploidSNPGenotypePriors)priors, UAC.defaultPlatform);
|
||||||
|
GL.add(pileup, true, UAC.CAP_BASE_QUALITY);
|
||||||
|
double[] likelihoods = GL.getLikelihoods();
|
||||||
|
double[] posteriors = GL.getPosteriors();
|
||||||
|
|
||||||
|
DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(refBase);
|
||||||
|
DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(refBase, bestAlternateAllele);
|
||||||
|
DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(bestAlternateAllele);
|
||||||
|
GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),
|
||||||
|
refAllele,
|
||||||
|
altAllele,
|
||||||
|
likelihoods[refGenotype.ordinal()],
|
||||||
|
likelihoods[hetGenotype.ordinal()],
|
||||||
|
likelihoods[homGenotype.ordinal()],
|
||||||
|
posteriors[refGenotype.ordinal()],
|
||||||
|
posteriors[hetGenotype.ordinal()],
|
||||||
|
posteriors[homGenotype.ordinal()]));
|
||||||
|
}
|
||||||
|
|
||||||
|
return refAllele;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void initializeBestAlternateAllele(byte ref, Map<String, StratifiedAlignmentContext> contexts) {
|
||||||
|
int[] qualCounts = new int[4];
|
||||||
|
|
||||||
|
for ( Map.Entry<String, StratifiedAlignmentContext> sample : contexts.entrySet() ) {
|
||||||
|
// calculate the sum of quality scores for each base
|
||||||
|
ReadBackedPileup pileup = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup();
|
||||||
|
for ( PileupElement p : pileup ) {
|
||||||
|
// ignore deletions
|
||||||
|
if ( p.isDeletion() )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
int index = BaseUtils.simpleBaseToBaseIndex(p.getBase());
|
||||||
|
if ( index >= 0 )
|
||||||
|
qualCounts[index] += p.getQual();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// set the non-ref base with maximum quality score sum
|
||||||
|
int maxCount = 0;
|
||||||
|
bestAlternateAllele = null;
|
||||||
|
for ( byte altAllele : BaseUtils.BASES ) {
|
||||||
|
if ( altAllele == ref )
|
||||||
|
continue;
|
||||||
|
int index = BaseUtils.simpleBaseToBaseIndex(altAllele);
|
||||||
|
if ( qualCounts[index] > maxCount ) {
|
||||||
|
maxCount = qualCounts[index];
|
||||||
|
bestAlternateAllele = altAllele;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,63 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import static java.lang.Math.log10;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
public class ThreeStateErrorProbabilities extends FourBaseLikelihoods {
|
||||||
|
//
|
||||||
|
// forwarding constructors -- don't do anything at all
|
||||||
|
//
|
||||||
|
public ThreeStateErrorProbabilities() { super(); }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cloning of the object
|
||||||
|
* @return clone
|
||||||
|
* @throws CloneNotSupportedException
|
||||||
|
*/
|
||||||
|
protected Object clone() throws CloneNotSupportedException {
|
||||||
|
return super.clone();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple log10(3) cached value
|
||||||
|
*/
|
||||||
|
protected static final double log103 = log10(3.0);
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param observedBase observed base
|
||||||
|
* @param chromBase target base
|
||||||
|
* @param read SAM read
|
||||||
|
* @param offset offset on read
|
||||||
|
* @return log10 likelihood
|
||||||
|
*/
|
||||||
|
protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) {
|
||||||
|
return -log103; // equivalent to e / 3 model
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,125 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.commandline.Argument;
|
||||||
|
import org.broadinstitute.sting.commandline.Hidden;
|
||||||
|
|
||||||
|
|
||||||
|
public class UnifiedArgumentCollection {
|
||||||
|
|
||||||
|
// control the various models to be used
|
||||||
|
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while DINDEL is also available for calling indels.", required = false)
|
||||||
|
public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
|
||||||
|
|
||||||
|
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false)
|
||||||
|
public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
|
||||||
|
|
||||||
|
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ when using the SNP Genotype Likelihoods model -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
|
||||||
|
public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL;
|
||||||
|
|
||||||
|
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
|
||||||
|
public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY;
|
||||||
|
|
||||||
|
// control the output
|
||||||
|
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false)
|
||||||
|
public boolean GENOTYPE_MODE = false;
|
||||||
|
|
||||||
|
@Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output all callable bases?", required = false)
|
||||||
|
public boolean ALL_BASES_MODE = false;
|
||||||
|
|
||||||
|
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false)
|
||||||
|
public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
|
||||||
|
|
||||||
|
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
|
||||||
|
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
|
||||||
|
|
||||||
|
@Argument(fullName = "trigger_min_confidence_threshold_for_calling", shortName = "trig_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be called", required = false)
|
||||||
|
public double TRIGGER_CONFIDENCE_FOR_CALLING = 30.0;
|
||||||
|
|
||||||
|
@Argument(fullName = "trigger_min_confidence_threshold_for_emitting", shortName = "trig_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
|
||||||
|
public double TRIGGER_CONFIDENCE_FOR_EMITTING = 30.0;
|
||||||
|
|
||||||
|
@Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false)
|
||||||
|
public boolean NO_SLOD = false;
|
||||||
|
|
||||||
|
|
||||||
|
// control the error modes
|
||||||
|
@Hidden
|
||||||
|
@Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
|
||||||
|
public String ASSUME_SINGLE_SAMPLE = null;
|
||||||
|
|
||||||
|
@Hidden
|
||||||
|
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
|
||||||
|
public EmpiricalSubstitutionProbabilities.SequencerPlatform defaultPlatform = null;
|
||||||
|
|
||||||
|
|
||||||
|
// control the various parameters to be used
|
||||||
|
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
|
||||||
|
public int MIN_BASE_QUALTY_SCORE = 10;
|
||||||
|
|
||||||
|
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false)
|
||||||
|
public int MIN_MAPPING_QUALTY_SCORE = 10;
|
||||||
|
|
||||||
|
@Argument(fullName = "max_mismatches_in_40bp_window", shortName = "mm40", doc = "Maximum number of mismatches within a 40 bp window (20bp on either side) around the target position for a read to be used for calling", required = false)
|
||||||
|
public int MAX_MISMATCHES = 3;
|
||||||
|
|
||||||
|
@Argument(fullName = "use_reads_with_bad_mates", shortName = "bad_mates", doc = "Use reads whose mates are mapped excessively far away for calling", required = false)
|
||||||
|
public boolean USE_BADLY_MATED_READS = false;
|
||||||
|
|
||||||
|
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
||||||
|
public Double MAX_DELETION_FRACTION = 0.05;
|
||||||
|
|
||||||
|
@Argument(fullName = "cap_base_quality_by_mapping_quality", shortName = "cap_base_qual", doc = "Cap the base quality of any given base by its read's mapping quality", required = false)
|
||||||
|
public boolean CAP_BASE_QUALITY = false;
|
||||||
|
|
||||||
|
|
||||||
|
public UnifiedArgumentCollection clone() {
|
||||||
|
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||||
|
|
||||||
|
uac.GLmodel = GLmodel;
|
||||||
|
uac.baseModel = baseModel;
|
||||||
|
uac.heterozygosity = heterozygosity;
|
||||||
|
uac.GENOTYPE_MODE = GENOTYPE_MODE;
|
||||||
|
uac.ALL_BASES_MODE = ALL_BASES_MODE;
|
||||||
|
uac.NO_SLOD = NO_SLOD;
|
||||||
|
uac.ASSUME_SINGLE_SAMPLE = ASSUME_SINGLE_SAMPLE;
|
||||||
|
uac.defaultPlatform = defaultPlatform;
|
||||||
|
uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING;
|
||||||
|
uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||||
|
uac.TRIGGER_CONFIDENCE_FOR_CALLING = TRIGGER_CONFIDENCE_FOR_CALLING;
|
||||||
|
uac.TRIGGER_CONFIDENCE_FOR_EMITTING = TRIGGER_CONFIDENCE_FOR_EMITTING;
|
||||||
|
uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE;
|
||||||
|
uac.MIN_MAPPING_QUALTY_SCORE = MIN_MAPPING_QUALTY_SCORE;
|
||||||
|
uac.MAX_MISMATCHES = MAX_MISMATCHES;
|
||||||
|
uac.USE_BADLY_MATED_READS = USE_BADLY_MATED_READS;
|
||||||
|
uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION;
|
||||||
|
|
||||||
|
return uac;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,477 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
|
import org.broad.tribble.util.variantcontext.Genotype;
|
||||||
|
import org.broad.tribble.util.variantcontext.Allele;
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||||
|
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
import org.broadinstitute.sting.utils.pileup.*;
|
||||||
|
import org.broad.tribble.vcf.VCFConstants;
|
||||||
|
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
|
public class UnifiedGenotyperEngine {
|
||||||
|
|
||||||
|
    // name of the ROD track that can be used to trigger calls at specific sites
    public static final String TRIGGER_TRACK_NAME = "trigger";
    // name of the filter applied to calls below the calling threshold but above the emit threshold
    public static final String LOW_QUAL_FILTER_NAME = "LowQual";

    // the unified argument collection
    private UnifiedArgumentCollection UAC = null;

    // the annotation engine (may be null when annotation is disabled)
    private VariantAnnotatorEngine annotationEngine;

    // the model used for calculating genotypes; one instance per thread since the model is stateful
    private ThreadLocal<GenotypeLikelihoodsCalculationModel> glcm = new ThreadLocal<GenotypeLikelihoodsCalculationModel>();

    // the model used for calculating p(non-ref); one instance per thread since the model is stateful
    private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();

    // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
    private double[] log10AlleleFrequencyPriors;

    // the allele frequency likelihoods (allocated once per thread as an optimization)
    private ThreadLocal<double[]> log10AlleleFrequencyPosteriors = new ThreadLocal<double[]>();

    // the priors object
    private GenotypePriors genotypePriors;

    // the various loggers and writers (any of these may be null)
    private Logger logger = null;
    private PrintStream verboseWriter = null;

    // number of chromosomes (2 * samples) in input
    private int N;

    // the standard filter to use for calls below the confidence threshold but above the emit threshold
    // NOTE(review): this set is static but is populated in initialize(), so it is shared across
    // all engine instances in the JVM -- confirm that is intentional
    private static final Set<String> filter = new HashSet<String>(1);
|
||||||
|
|
||||||
|
|
||||||
|
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
|
||||||
|
// get the number of samples
|
||||||
|
// if we're supposed to assume a single sample, do so
|
||||||
|
int numSamples;
|
||||||
|
if ( UAC.ASSUME_SINGLE_SAMPLE != null )
|
||||||
|
numSamples = 1;
|
||||||
|
else
|
||||||
|
numSamples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()).size();
|
||||||
|
initialize(UAC, null, null, null, numSamples);
|
||||||
|
}
|
||||||
|
|
||||||
|
    /**
     * Creates a fully-configured engine with explicit logging, verbose-output,
     * and annotation collaborators (any of which may be null).
     *
     * @param UAC           the unified argument collection
     * @param logger        logger for status messages, or null
     * @param verboseWriter stream for per-site debugging output, or null
     * @param engine        annotation engine, or null to skip annotation
     * @param numSamples    number of samples in the input
     */
    public UnifiedGenotyperEngine(UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
        initialize(UAC, logger, verboseWriter, engine, numSamples);
    }
|
||||||
|
|
||||||
|
    /**
     * Shared initialization for both constructors: stores the collaborators,
     * allocates and fills the allele-frequency prior cache, and builds the
     * genotype priors object.
     *
     * @param UAC           the unified argument collection
     * @param logger        logger, or null
     * @param verboseWriter verbose output stream, or null
     * @param engine        annotation engine, or null
     * @param numSamples    number of samples in the input
     */
    private void initialize(UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
        this.UAC = UAC;
        this.logger = logger;
        this.verboseWriter = verboseWriter;
        this.annotationEngine = engine;

        // N is the number of chromosomes: 2 per sample (diploid assumption)
        N = 2 * numSamples;
        // priors are indexed by allele count 0..N inclusive, hence N+1 slots
        log10AlleleFrequencyPriors = new double[N+1];
        computeAlleleFrequencyPriors(N);
        genotypePriors = createGenotypePriors(UAC);

        filter.add(LOW_QUAL_FILTER_NAME);
    }
|
||||||
|
|
||||||
|
    /**
     * Compute at a given locus.
     *
     * Drives the whole calling pipeline for one site: filter/stratify the pileup,
     * compute per-sample genotype likelihoods, estimate the allele-frequency
     * posterior, apply emit/call thresholds, assign genotypes, and package the
     * result (with optional annotation and verbose output) into a VariantCallContext.
     *
     * @param tracker the meta data tracker
     * @param refContext the reference base
     * @param rawContext contextual information around the locus
     * @return the VariantCallContext object, or null when the site is not callable
     */
    public VariantCallContext runGenotyper(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {

        // initialize the GenotypeCalculationModel for this thread if that hasn't been done yet
        if ( glcm.get() == null ) {
            glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
            log10AlleleFrequencyPosteriors.set(new double[N+1]);
            afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
        }

        // remove bad reads (bad mates / too many mismatches) and stratify by sample
        BadReadPileupFilter badReadPileupFilter = new BadReadPileupFilter(refContext);
        Map<String, StratifiedAlignmentContext> stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, badReadPileupFilter);
        if ( stratifiedContexts == null )
            return null;

        // compute the per-sample biallelic genotype likelihoods
        Map<String, BiallelicGenotypeLikelihoods> GLs = new HashMap<String, BiallelicGenotypeLikelihoods>();
        Allele refAllele = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE, genotypePriors, GLs);

        // estimate our confidence in a reference call and return
        if ( GLs.size() == 0 )
            return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), false);

        // reset the optimization value and determine the p(AF>0)
        // TODO: get rid of this optimization, it is wrong!
        afcm.get().setMinAlleleFrequencyToTest(0);

        // zero out the AFs above the N for this position (only 2 * GLs.size()
        // chromosomes actually have data here)
        ignoreAlleleFrequenciesAboveI(log10AlleleFrequencyPosteriors.get(), 2 * GLs.size());
        afcm.get().getLog10PNonRef(tracker, refContext, GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get());

        // find the most likely frequency
        int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get());

        // calculate p(f>0) by summing the normalized posteriors over all non-zero counts
        double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get());
        double sum = 0.0;
        for (int i = 1; i <= N; i++)
            sum += normalizedPosteriors[i];
        double PofF = Math.min(sum, 1.0); // deal with precision errors

        // phred-scale the confidence; if normalization underflowed to 0 (infinite
        // phred score), fall back to the unnormalized log10 posteriors
        double phredScaledConfidence;
        if ( bestAFguess != 0 ) {
            // variant call: confidence is against the AF=0 hypothesis
            phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
            if ( Double.isInfinite(phredScaledConfidence) )
                phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0];
        } else {
            // reference call: confidence is against the AF>0 hypotheses
            phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
            if ( Double.isInfinite(phredScaledConfidence) ) {
                sum = 0.0;
                for (int i = 1; i <= N; i++) {
                    // stop at the first sentinel entry written by ignoreAlleleFrequenciesAboveI
                    if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
                        break;
                    sum += log10AlleleFrequencyPosteriors.get()[i];
                }
                phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
            }
        }

        // did we trigger on the provided track?
        boolean atTriggerTrack = tracker.getReferenceMetaData(TRIGGER_TRACK_NAME, false).size() > 0;

        // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
        if ( !UAC.ALL_BASES_MODE && !passesEmitThreshold(phredScaledConfidence, bestAFguess, atTriggerTrack) ) {
            // technically, at this point our confidence in a reference call isn't accurately estimated
            // because it didn't take into account samples with no data, so let's get a better estimate
            return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), true);
        }

        // create the genotypes
        Map<String, Genotype> genotypes = afcm.get().assignGenotypes(stratifiedContexts, GLs, log10AlleleFrequencyPosteriors.get(), bestAFguess);

        // next, get the variant context data (alleles, attributes, etc.)
        HashSet<Allele> alleles = new HashSet<Allele>();
        alleles.add(refAllele);
        for ( Genotype g : genotypes.values() )
            alleles.addAll(g.getAlleles());

        // *** note that calculating strand bias involves overwriting data structures, so we do that last
        HashMap<String, Object> attributes = new HashMap<String, Object>();

        // annotate dbSNP membership when a record is present at this site
        DbSNPFeature dbsnp = getDbSNP(tracker);
        if ( dbsnp != null )
            attributes.put(VariantContext.ID_KEY, dbsnp.getRsID());

        // if the site was downsampled, record that fact
        if ( rawContext.hasPileupBeenDownsampled() )
            attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);

        // strand bias (SLOD) is not implemented yet in this refactored engine
        if ( !UAC.NO_SLOD ) {

            // TODO: implement me

            //Map<String, BiallelicGenotypeLikelihoods> forwardGLs = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD, genotypePriors);
            //Map<String, BiallelicGenotypeLikelihoods> reverseGLs = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE, genotypePriors);

        }

        // package the call; sub-call-threshold sites get the LowQual filter
        GenomeLoc loc = refContext.getLocus();
        VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes);

        if ( annotationEngine != null ) {
            // first off, we want to use the *unfiltered* context for the annotations
            stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE);

            Collection<VariantContext> variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vc);
            vc = variantContexts.iterator().next(); //We know the collection will always have exactly 1 element.
        }

        // print out stats if we have a writer
        if ( verboseWriter != null )
            printVerboseData(vc, PofF, phredScaledConfidence, normalizedPosteriors);

        VariantCallContext call = new VariantCallContext(vc, passesCallThreshold(phredScaledConfidence, atTriggerTrack));
        call.setRefBase(refContext.getBase());
        return call;
    }
|
||||||
|
|
||||||
|
private static boolean isValidDeletionFraction(double d) {
|
||||||
|
return ( d >= 0.0 && d <= 1.0 );
|
||||||
|
}
|
||||||
|
|
||||||
|
    /**
     * Applies the quality/bad-read filters appropriate to the active genotype
     * likelihoods model and splits the surviving pileup by sample.
     *
     * @param UAC                 the unified argument collection
     * @param refContext          the reference context at this locus
     * @param rawContext          the unfiltered alignment context
     * @param badReadPileupFilter filter removing badly-mated / high-mismatch reads
     * @return per-sample stratified contexts, or null when the site should not be called
     *         (no coverage, irregular reference base, excessive deletions, or the
     *         context type doesn't match the model)
     */
    private Map<String, StratifiedAlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, BadReadPileupFilter badReadPileupFilter) {
        Map<String, StratifiedAlignmentContext> stratifiedContexts = null;

        // indel (DINDEL) model: operates on the extended-event pileup
        if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL && rawContext.hasExtendedEventPileup() ) {

            ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup();

            // filter the context based on min mapping quality
            ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE);

            // filter the context based on bad mates and mismatch rate
            pileup = pileup.getFilteredPileup(badReadPileupFilter);

            // don't call when there is no coverage
            if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE )
                return null;

            // stratify the AlignmentContext and cut by sample
            stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UAC.ASSUME_SINGLE_SAMPLE);

        // SNP model: operates on the regular base pileup
        } else if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP && !rawContext.hasExtendedEventPileup() ) {

            // only call against a regular (A/C/G/T) reference base
            byte ref = refContext.getBase();
            if ( !BaseUtils.isRegularBase(ref) )
                return null;

            ReadBackedPileup rawPileup = rawContext.getBasePileup();

            // filter the context based on min base and mapping qualities
            ReadBackedPileup pileup = rawPileup.getBaseAndMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE, UAC.MIN_MAPPING_QUALTY_SCORE);

            // filter the context based on bad mates and mismatch rate
            pileup = pileup.getFilteredPileup(badReadPileupFilter);

            // don't call when there is no coverage
            if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE )
                return null;

            // are there too many deletions in the pileup?
            if ( isValidDeletionFraction(UAC.MAX_DELETION_FRACTION) &&
                    (double)pileup.getNumberOfDeletions() / (double)pileup.size() > UAC.MAX_DELETION_FRACTION )
                return null;

            // stratify the AlignmentContext and cut by sample
            stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UAC.ASSUME_SINGLE_SAMPLE);
        }

        return stratifiedContexts;
    }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param AFs AF array
|
||||||
|
* @param freqI allele frequency I
|
||||||
|
*/
|
||||||
|
protected static void ignoreAlleleFrequenciesAboveI(double[] AFs, int freqI) {
|
||||||
|
while ( ++freqI < AFs.length )
|
||||||
|
AFs[freqI] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
|
||||||
|
}
|
||||||
|
|
||||||
|
    /**
     * Builds a reference-call context with a rough confidence estimate.
     *
     * NOTE(review): the real per-sample, depth-based estimate is still commented
     * out (see TODO below); currently P(ref) is hard-coded to 0.5, so the reported
     * confidence is a placeholder. The theta and ignoreCoveredSamples parameters
     * are unused until the commented-out code is enabled.
     *
     * @param contexts             per-sample stratified contexts (currently unused)
     * @param theta                heterozygosity (currently unused)
     * @param ignoreCoveredSamples whether to skip samples with data (currently unused)
     * @return a call context marked confidently-called iff the placeholder
     *         confidence clears the standard calling threshold
     */
    private VariantCallContext estimateReferenceConfidence(Map<String, StratifiedAlignmentContext> contexts, double theta, boolean ignoreCoveredSamples) {

        // TODO: implement me

        double P_of_ref = 1.0;

        // use the AF=0 prob if it's calculated
        //if ( ignoreCoveredSamples )
        //    P_of_ref = 1.0 - PofFs[BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele)];

        // for each sample that we haven't examined yet
        //for ( String sample : samples ) {
        //    boolean isCovered = contexts.containsKey(sample);
        //    if ( ignoreCoveredSamples && isCovered )
        //        continue;

        // placeholder until the depth-based loop above is implemented
        P_of_ref = 0.5;
        //    int depth = isCovered ? contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup().size() : 0;
        //    P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5);
        //}

        return new VariantCallContext(QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING);
    }
|
||||||
|
|
||||||
|
    /**
     * Writes one tab-delimited AFINFO line per allele count (0..N) plus summary
     * lines to the verbose writer. Column order matches the header printed by the
     * walker's initialize(): LOC, REF, ALT, MAF, F, AFprior, AFposterior,
     * NormalizedPosterior.
     *
     * @param vc                    the called variant context
     * @param PofF                  probability that the allele frequency is > 0
     * @param phredScaledConfidence phred-scaled call confidence
     * @param normalizedPosteriors  normalized allele-frequency posteriors
     */
    protected void printVerboseData(VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors) {
        for (int i = 0; i <= N; i++) {
            StringBuilder AFline = new StringBuilder("AFINFO\t");
            AFline.append(vc.getChr()).append(":").append(vc.getStart()).append("\t");
            AFline.append(vc.getReference()).append("\t");
            // only biallelic sites have a single well-defined alternate allele
            if ( vc.isBiallelic() )
                AFline.append(vc.getAlternateAllele(0)).append("\t");
            else
                AFline.append("N/A\t");
            AFline.append(i + "/" + N + "\t");
            AFline.append(String.format("%.2f\t", ((float)i)/N));
            AFline.append(String.format("%.8f\t", log10AlleleFrequencyPriors[i]));
            AFline.append(String.format("%.8f\t", log10AlleleFrequencyPosteriors.get()[i]));
            AFline.append(String.format("%.8f\t", normalizedPosteriors[i]));
            verboseWriter.println(AFline.toString());
        }

        verboseWriter.println("P(f>0) = " + PofF);
        verboseWriter.println("Qscore = " + phredScaledConfidence);
        verboseWriter.println();
    }
|
||||||
|
|
||||||
|
    /**
     * Filters low quality reads out of the pileup.
     *
     * A read is kept only when its mate is acceptable (or badly-mated reads are
     * explicitly allowed) and its mismatch count against a +/-20bp reference
     * window is within the configured limit.
     */
    private class BadReadPileupFilter implements PileupElementFilter {
        // reference context restricted to a +/-20bp window around the locus
        private ReferenceContext refContext;

        public BadReadPileupFilter(ReferenceContext ref) {
            // create the +/-20bp window, clamped to the bounds of the available reference window
            GenomeLoc window = GenomeLocParser.createGenomeLoc(ref.getLocus().getContig(), Math.max(ref.getWindow().getStart(), ref.getLocus().getStart()-20), Math.min(ref.getWindow().getStop(), ref.getLocus().getStart()+20));
            // 41 = 20bp on either side plus the locus itself
            byte[] bases = new byte[41];
            System.arraycopy(ref.getBases(), (int)Math.max(0, window.getStart()-ref.getWindow().getStart()), bases, 0, (int)window.size());
            refContext = new ReferenceContext(ref.getLocus(), window, bases);
        }

        // keep the element iff the read's mate is acceptable (or badly-mated reads
        // are allowed) and the mismatch count in the window is within the limit
        public boolean allow(PileupElement pileupElement) {
            return ((UAC.USE_BADLY_MATED_READS || !BadMateFilter.hasBadMate(pileupElement.getRead())) &&
                    AlignmentUtils.mismatchesInRefWindow(pileupElement, refContext, true) <= UAC.MAX_MISMATCHES );
        }
    }
|
||||||
|
|
||||||
|
    /**
     * @param tracker rod data
     *
     * @return the dbsnp rod if there is one at this position; callers treat a
     *         missing record as null (see the null check in runGenotyper)
     */
    protected static DbSNPFeature getDbSNP(RefMetaDataTracker tracker) {
        return DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
    }
|
||||||
|
|
||||||
|
protected boolean passesEmitThreshold(double conf, int bestAFguess, boolean atTriggerTrack) {
|
||||||
|
return (atTriggerTrack ?
|
||||||
|
(conf >= Math.min(UAC.TRIGGER_CONFIDENCE_FOR_CALLING, UAC.TRIGGER_CONFIDENCE_FOR_EMITTING)) :
|
||||||
|
((UAC.GENOTYPE_MODE || bestAFguess != 0) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING)));
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean passesCallThreshold(double conf, boolean atTriggerTrack) {
|
||||||
|
return (atTriggerTrack ?
|
||||||
|
(conf >= UAC.TRIGGER_CONFIDENCE_FOR_CALLING) :
|
||||||
|
(conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING));
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void computeAlleleFrequencyPriors(int N) {
|
||||||
|
// calculate sum(1/i)
|
||||||
|
double sigma_1_over_I = 0.0;
|
||||||
|
for (int i = 1; i <= N; i++)
|
||||||
|
sigma_1_over_I += 1.0 / (double)i;
|
||||||
|
|
||||||
|
// delta = theta / sum(1/i)
|
||||||
|
double delta = UAC.heterozygosity / sigma_1_over_I;
|
||||||
|
|
||||||
|
// calculate the null allele frequencies for 1-N
|
||||||
|
double sum = 0.0;
|
||||||
|
for (int i = 1; i <= N; i++) {
|
||||||
|
double value = delta / (double)i;
|
||||||
|
log10AlleleFrequencyPriors[i] = Math.log10(value);
|
||||||
|
sum += value;
|
||||||
|
}
|
||||||
|
|
||||||
|
// null frequency for AF=0 is (1 - sum(all other frequencies))
|
||||||
|
log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum);
|
||||||
|
}
|
||||||
|
|
||||||
|
    // TODO: enable me
    /**
     * Alternative prior computation that uses heterozygosity/i directly, without
     * the harmonic-number normalization used by computeAlleleFrequencyPriors.
     * NOTE(review): does not appear to be called from anywhere in this file yet.
     *
     * @param N number of chromosomes in the input
     */
    protected void computeAlleleFrequencyPriorsCorrect(int N) {
        // calculate the allele frequency priors for 1-N
        double sum = 0.0;
        for (int i = 1; i <= N; i++) {
            double value = UAC.heterozygosity / (double)i;
            log10AlleleFrequencyPriors[i] = Math.log10(value);
            sum += value;
        }

        // null frequency for AF=0 is (1 - sum(all other frequencies))
        log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum);
    }
|
||||||
|
|
||||||
|
private static GenotypePriors createGenotypePriors(UnifiedArgumentCollection UAC) {
|
||||||
|
GenotypePriors priors;
|
||||||
|
switch ( UAC.GLmodel ) {
|
||||||
|
case SNP:
|
||||||
|
// use flat priors for GLs
|
||||||
|
priors = new DiploidSNPGenotypePriors();
|
||||||
|
break;
|
||||||
|
case DINDEL:
|
||||||
|
// TODO: create indel priors object
|
||||||
|
priors = null;
|
||||||
|
break;
|
||||||
|
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
|
||||||
|
}
|
||||||
|
|
||||||
|
return priors;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static GenotypeLikelihoodsCalculationModel getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) {
|
||||||
|
GenotypeLikelihoodsCalculationModel glcm;
|
||||||
|
switch ( UAC.GLmodel ) {
|
||||||
|
case SNP:
|
||||||
|
glcm = new SNPGenotypeLikelihoodsCalculationModel(UAC, logger);
|
||||||
|
break;
|
||||||
|
case DINDEL:
|
||||||
|
glcm = new DindelGenotypeLikelihoodsCalculationModel(UAC, logger);
|
||||||
|
break;
|
||||||
|
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
|
||||||
|
}
|
||||||
|
|
||||||
|
return glcm;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
|
||||||
|
AlleleFrequencyCalculationModel afcm;
|
||||||
|
switch ( UAC.AFmodel ) {
|
||||||
|
case EXACT:
|
||||||
|
afcm = new ExactAFCalculationModel(N, logger, verboseWriter);
|
||||||
|
break;
|
||||||
|
case GRID_SEARCH:
|
||||||
|
afcm = new GridSearchAFEstimation(N, logger, verboseWriter);
|
||||||
|
break;
|
||||||
|
default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel);
|
||||||
|
}
|
||||||
|
|
||||||
|
return afcm;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,241 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import org.broad.tribble.vcf.*;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.*;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||||
|
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||||
|
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
import org.broadinstitute.sting.commandline.*;
|
||||||
|
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A variant caller which unifies the approaches of several disparate callers. Works for single-sample and
|
||||||
|
* multi-sample data. The user can choose from several different incorporated calculation models.
|
||||||
|
*/
|
||||||
|
@Reference(window=@Window(start=-200,stop=200))
|
||||||
|
@By(DataSource.REFERENCE)
|
||||||
|
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
|
||||||
|
public class UnifiedGenotyperV2 extends LocusWalker<VariantCallContext, UnifiedGenotyperV2.UGStatistics> implements TreeReducible<UnifiedGenotyperV2.UGStatistics> {
|
||||||
|
|
||||||
|
    @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();

    // control the output
    @Output(doc="File to which variants should be written",required=true)
    protected VCFWriter writer = null;

    // deprecated alias kept only so old command lines fail with a helpful message
    @Argument(fullName="variants_out",shortName="varout",doc="Please use --out instead",required=false)
    @Deprecated
    protected String varout;

    @Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
    protected PrintStream verboseWriter = null;

    @Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print any relevant callability metrics output", required = false)
    protected PrintStream metricsWriter = null;

    @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
    protected List<String> annotationsToUse = new ArrayList<String>();

    @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
    protected String[] annotationClassesToUse = { "Standard" };

    // the calculation arguments: the shared engine that does the actual calling
    private UnifiedGenotyperEngine UG_engine = null;

    // the annotation engine
    private VariantAnnotatorEngine annotationEngine;

    // samples in input
    private Set<String> samples = new TreeSet<String>();

    // enable deletions in the pileup
    public boolean includeReadsWithDeletionAtLoci() { return true; }

    // enable extended events for indels (only needed by the DINDEL model)
    public boolean generateExtendedEvents() { return UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Inner class for collecting output statistics from the UG
|
||||||
|
*/
|
||||||
|
public static class UGStatistics {
|
||||||
|
/** The total number of passes examined -- i.e., the number of map calls */
|
||||||
|
long nBasesVisited = 0;
|
||||||
|
|
||||||
|
/** The number of bases that were potentially callable -- i.e., those not at excessive coverage or masked with N */
|
||||||
|
long nBasesCallable = 0;
|
||||||
|
|
||||||
|
/** The number of bases called confidently (according to user threshold), either ref or other */
|
||||||
|
long nBasesCalledConfidently = 0;
|
||||||
|
|
||||||
|
/** The number of bases for which calls were emitted */
|
||||||
|
long nCallsMade = 0;
|
||||||
|
|
||||||
|
/** The total number of extended events encountered */
|
||||||
|
long nExtendedEvents = 0;
|
||||||
|
|
||||||
|
double percentCallableOfAll() { return (100.0 * nBasesCallable) / (nBasesVisited-nExtendedEvents); }
|
||||||
|
double percentCalledOfAll() { return (100.0 * nBasesCalledConfidently) / (nBasesVisited-nExtendedEvents); }
|
||||||
|
double percentCalledOfCallable() { return (100.0 * nBasesCalledConfidently) / (nBasesVisited-nExtendedEvents); }
|
||||||
|
}
|
||||||
|
|
||||||
|
    /**
     * Initialize the samples, output, and genotype calculation model
     *
     **/
    public void initialize() {
        // get all of the unique sample names
        // if we're supposed to assume a single sample, do so
        if ( UAC.ASSUME_SINGLE_SAMPLE != null )
            samples.add(UAC.ASSUME_SINGLE_SAMPLE);
        else
            samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());

        // initialize the verbose writer with the AFINFO column-header line
        if ( verboseWriter != null )
            verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior");

        // build the annotation engine first; the genotyper engine holds a reference to it
        annotationEngine = new VariantAnnotatorEngine(getToolkit(), Arrays.asList(annotationClassesToUse), annotationsToUse);
        UG_engine = new UnifiedGenotyperEngine(UAC, logger, verboseWriter, annotationEngine, samples.size());

        // initialize the header
        writer.writeHeader(new VCFHeader(getHeaderInfo(), samples)) ;
    }
|
||||||
|
|
||||||
|
    /**
     * Assembles the VCF header lines: annotation descriptions, UG INFO fields,
     * membership flags for any comp/dbSNP rods bound on the command line, the
     * standard FORMAT/INFO fields, and the LowQual FILTER when applicable.
     *
     * @return the complete set of header lines for the output VCF
     */
    private Set<VCFHeaderLine> getHeaderInfo() {
        Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();

        // all annotation fields from VariantAnnotatorEngine
        headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());

        // annotation (INFO) fields from UnifiedGenotyper
        // NOTE(review): only the strand-bias line is guarded by NO_SLOD (unbraced if);
        // the downsampled flag is always declared -- confirm that is intentional
        if ( !UAC.NO_SLOD )
            headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias"));
        headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?"));

        // also, check to see whether comp rods were included
        List<ReferenceOrderedDataSource> dataSources = getToolkit().getRodDataSources();
        for ( ReferenceOrderedDataSource source : dataSources ) {
            if ( source.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
                headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership"));
            }
            else if ( source.getName().startsWith(VariantAnnotatorEngine.dbPrefix) ) {
                String name = source.getName().substring(VariantAnnotatorEngine.dbPrefix.length());
                headerInfo.add(new VCFInfoHeaderLine(name, 0, VCFHeaderLineType.Flag, name + " Membership"));
            }
        }

        // FORMAT and INFO fields
        headerInfo.addAll(VCFUtils.getSupportedHeaderStrings());

        // FILTER fields: LowQual is only possible when the emit threshold is below the call threshold
        if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING ||
                UAC.TRIGGER_CONFIDENCE_FOR_EMITTING < UAC.TRIGGER_CONFIDENCE_FOR_CALLING )
            headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality"));

        return headerInfo;
    }
|
||||||
|
|
||||||
|
    /**
     * Compute at a given locus.
     *
     * @param tracker the meta data tracker
     * @param refContext the reference base
     * @param rawContext contextual information around the locus
     * @return the VariantCallContext object
     */
    public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
        // all of the real work is delegated to the shared engine
        return UG_engine.runGenotyper(tracker, refContext, rawContext);
    }
|
||||||
|
|
||||||
|
    /** Creates the per-traversal statistics accumulator consumed by reduce(). */
    public UGStatistics reduceInit() { return new UGStatistics(); }
|
||||||
|
|
||||||
|
public UGStatistics treeReduce(UGStatistics lhs, UGStatistics rhs) {
|
||||||
|
lhs.nBasesCallable += rhs.nBasesCallable;
|
||||||
|
lhs.nBasesCalledConfidently += rhs.nBasesCalledConfidently;
|
||||||
|
lhs.nBasesVisited += rhs.nBasesVisited;
|
||||||
|
lhs.nCallsMade += rhs.nCallsMade;
|
||||||
|
return lhs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public UGStatistics reduce(VariantCallContext value, UGStatistics sum) {
|
||||||
|
// we get a point for reaching reduce
|
||||||
|
sum.nBasesVisited++;
|
||||||
|
|
||||||
|
// can't call the locus because of no coverage
|
||||||
|
if ( value == null )
|
||||||
|
return sum;
|
||||||
|
|
||||||
|
// A call was attempted -- the base was potentially callable
|
||||||
|
sum.nBasesCallable++;
|
||||||
|
|
||||||
|
// the base was confidently callable
|
||||||
|
sum.nBasesCalledConfidently += value.confidentlyCalled ? 1 : 0;
|
||||||
|
|
||||||
|
// can't make a confident variant call here
|
||||||
|
if ( value.vc == null )
|
||||||
|
return sum;
|
||||||
|
|
||||||
|
try {
|
||||||
|
// we are actually making a call
|
||||||
|
sum.nCallsMade++;
|
||||||
|
writer.add(value.vc, value.refBase);
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name");
|
||||||
|
}
|
||||||
|
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void onTraversalDone(UGStatistics sum) {
|
||||||
|
logger.info(String.format("Visited bases %d", sum.nBasesVisited));
|
||||||
|
logger.info(String.format("Callable bases %d", sum.nBasesCallable));
|
||||||
|
logger.info(String.format("Confidently called bases %d", sum.nBasesCalledConfidently));
|
||||||
|
logger.info(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll()));
|
||||||
|
logger.info(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll()));
|
||||||
|
logger.info(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable()));
|
||||||
|
logger.info(String.format("Actual calls made %d", sum.nCallsMade));
|
||||||
|
|
||||||
|
if ( metricsWriter != null ) {
|
||||||
|
metricsWriter.println(String.format("Visited bases %d", sum.nBasesVisited));
|
||||||
|
metricsWriter.println(String.format("Callable bases %d", sum.nBasesCallable));
|
||||||
|
metricsWriter.println(String.format("Confidently called bases %d", sum.nBasesCalledConfidently));
|
||||||
|
metricsWriter.println(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll()));
|
||||||
|
metricsWriter.println(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll()));
|
||||||
|
metricsWriter.println(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable()));
|
||||||
|
metricsWriter.println(String.format("Actual calls made %d", sum.nCallsMade));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,64 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||||
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: depristo, ebanks
|
||||||
|
* Date: Jan 22, 2010
|
||||||
|
* Time: 2:25:19 PM
|
||||||
|
*
|
||||||
|
* Useful helper class to communicate the results of calculateGenotype to framework
|
||||||
|
*/
|
||||||
|
public class VariantCallContext {
|
||||||
|
public VariantContext vc = null;
|
||||||
|
public byte refBase;
|
||||||
|
|
||||||
|
// Was the site called confidently, either reference or variant?
|
||||||
|
public boolean confidentlyCalled = false;
|
||||||
|
|
||||||
|
VariantCallContext(VariantContext vc, boolean confidentlyCalledP) {
|
||||||
|
this.vc = vc;
|
||||||
|
this.confidentlyCalled = confidentlyCalledP;
|
||||||
|
}
|
||||||
|
|
||||||
|
VariantCallContext(VariantContext vc, byte ref, boolean confidentlyCalledP) {
|
||||||
|
this.vc = vc;
|
||||||
|
this.refBase = ref;
|
||||||
|
this.confidentlyCalled = confidentlyCalledP;
|
||||||
|
}
|
||||||
|
|
||||||
|
// blank variant context => we're a ref site
|
||||||
|
VariantCallContext(boolean confidentlyCalledP) {
|
||||||
|
this.confidentlyCalled = confidentlyCalledP;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRefBase(byte ref) {
|
||||||
|
this.refBase = ref;
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue