Now that G's new genotyping algorithm is live, I've cleaned up the code to completely separate the grid search from the exact model. AlleleFrequencyCalculationModel is now completely abstract.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4517 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
80e5ac65b4
commit
d54d9880d7
|
|
@ -29,15 +29,9 @@ import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
|
||||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|
||||||
import org.broadinstitute.sting.utils.collections.Pair;
|
|
||||||
import org.broad.tribble.util.variantcontext.Genotype;
|
import org.broad.tribble.util.variantcontext.Genotype;
|
||||||
import org.broad.tribble.util.variantcontext.Allele;
|
|
||||||
import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
|
|
||||||
import org.broad.tribble.vcf.VCFConstants;
|
|
||||||
|
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
@ -54,24 +48,30 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
||||||
GRID_SEARCH
|
GRID_SEARCH
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final double LOG10_REFERENCE_CALL_EPSILON = 1.0;
|
|
||||||
|
|
||||||
protected int N;
|
protected int N;
|
||||||
protected AlleleFrequencyMatrix AFMatrix;
|
|
||||||
protected Set<BiallelicGenotypeLikelihoods> refCalls;
|
|
||||||
|
|
||||||
protected Logger logger;
|
protected Logger logger;
|
||||||
protected PrintStream verboseWriter;
|
protected PrintStream verboseWriter;
|
||||||
|
|
||||||
protected boolean useReferenceSampleOptimization;
|
protected enum GenotypeType { AA, AB, BB }
|
||||||
|
|
||||||
protected AlleleFrequencyCalculationModel(int N, Logger logger, PrintStream verboseWriter, boolean useReferenceSampleOptimization) {
|
protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE;
|
||||||
|
|
||||||
|
protected class CalculatedAlleleFrequency {
|
||||||
|
|
||||||
|
public double log10PNonRef;
|
||||||
|
public int altAlleles;
|
||||||
|
|
||||||
|
public CalculatedAlleleFrequency(double log10PNonRef, int altAlleles) {
|
||||||
|
this.log10PNonRef = log10PNonRef;
|
||||||
|
this.altAlleles = altAlleles;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected AlleleFrequencyCalculationModel(int N, Logger logger, PrintStream verboseWriter) {
|
||||||
this.N = N;
|
this.N = N;
|
||||||
this.logger = logger;
|
this.logger = logger;
|
||||||
this.verboseWriter = verboseWriter;
|
this.verboseWriter = verboseWriter;
|
||||||
this.useReferenceSampleOptimization = useReferenceSampleOptimization;
|
|
||||||
AFMatrix = new AlleleFrequencyMatrix(N);
|
|
||||||
refCalls = new HashSet<BiallelicGenotypeLikelihoods>();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -83,7 +83,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
||||||
* @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results
|
* @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results
|
||||||
* @param minFrequencyToCalculate the minimum frequency which needs to be calculated
|
* @param minFrequencyToCalculate the minimum frequency which needs to be calculated
|
||||||
*/
|
*/
|
||||||
public abstract void getLog10PNonRef(RefMetaDataTracker tracker,
|
protected abstract void getLog10PNonRef(RefMetaDataTracker tracker,
|
||||||
ReferenceContext ref,
|
ReferenceContext ref,
|
||||||
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
||||||
double[] log10AlleleFrequencyPriors,
|
double[] log10AlleleFrequencyPriors,
|
||||||
|
|
@ -99,101 +99,12 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
||||||
*
|
*
|
||||||
* @return calls
|
* @return calls
|
||||||
*/
|
*/
|
||||||
public Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
|
protected abstract Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
|
||||||
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
||||||
double[] log10AlleleFrequencyPosteriors,
|
double[] log10AlleleFrequencyPosteriors,
|
||||||
int AFofMaxLikelihood) {
|
int AFofMaxLikelihood);
|
||||||
initializeAFMatrix(GLs);
|
|
||||||
|
|
||||||
// increment the grid
|
protected int getUnfilteredDepth(ReadBackedPileup pileup) {
|
||||||
for (int i = 1; i <= AFofMaxLikelihood; i++) {
|
|
||||||
// add one more alternate allele
|
|
||||||
AFMatrix.incrementFrequency();
|
|
||||||
}
|
|
||||||
|
|
||||||
return generateCalls(contexts, GLs, AFofMaxLikelihood);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected Map<String, Genotype> generateCalls(Map<String, StratifiedAlignmentContext> contexts,
|
|
||||||
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
|
||||||
int frequency) {
|
|
||||||
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
|
|
||||||
|
|
||||||
// first, the potential alt calls
|
|
||||||
for ( String sample : AFMatrix.getSamples() ) {
|
|
||||||
BiallelicGenotypeLikelihoods GL = GLs.get(sample);
|
|
||||||
Allele alleleA = GL.getAlleleA();
|
|
||||||
Allele alleleB = GL.getAlleleB();
|
|
||||||
|
|
||||||
// set the genotype and confidence
|
|
||||||
Pair<Integer, Double> AFbasedGenotype = AFMatrix.getGenotype(frequency, sample);
|
|
||||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
|
||||||
if ( AFbasedGenotype.first == GenotypeType.AA.ordinal() ) {
|
|
||||||
myAlleles.add(alleleA);
|
|
||||||
myAlleles.add(alleleA);
|
|
||||||
} else if ( AFbasedGenotype.first == GenotypeType.AB.ordinal() ) {
|
|
||||||
myAlleles.add(alleleA);
|
|
||||||
myAlleles.add(alleleB);
|
|
||||||
} else { // ( AFbasedGenotype.first == GenotypeType.BB.ordinal() )
|
|
||||||
myAlleles.add(alleleB);
|
|
||||||
myAlleles.add(alleleB);
|
|
||||||
}
|
|
||||||
|
|
||||||
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
|
||||||
attributes.put(VCFConstants.DEPTH_KEY, getUnfilteredDepth(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup()));
|
|
||||||
|
|
||||||
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
|
|
||||||
attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());
|
|
||||||
|
|
||||||
calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, attributes, false));
|
|
||||||
}
|
|
||||||
|
|
||||||
// now, the clearly ref calls
|
|
||||||
for ( BiallelicGenotypeLikelihoods GL : refCalls ) {
|
|
||||||
String sample = GL.getSample();
|
|
||||||
|
|
||||||
Allele ref = GL.getAlleleA();
|
|
||||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
|
||||||
myAlleles.add(ref);
|
|
||||||
myAlleles.add(ref);
|
|
||||||
|
|
||||||
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
|
||||||
attributes.put(VCFConstants.DEPTH_KEY, getUnfilteredDepth(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup()));
|
|
||||||
|
|
||||||
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
|
|
||||||
attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());
|
|
||||||
|
|
||||||
double GQ = GL.getAALikelihoods() - Math.max(GL.getABLikelihoods(), GL.getBBLikelihoods());
|
|
||||||
|
|
||||||
calls.put(sample, new Genotype(sample, myAlleles, GQ, null, attributes, false));
|
|
||||||
}
|
|
||||||
|
|
||||||
return calls;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected void initializeAFMatrix(Map<String, BiallelicGenotypeLikelihoods> GLs) {
|
|
||||||
refCalls.clear();
|
|
||||||
AFMatrix.clear();
|
|
||||||
|
|
||||||
for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) {
|
|
||||||
if ( useReferenceSampleOptimization && isClearRefCall(GL) ) {
|
|
||||||
refCalls.add(GL);
|
|
||||||
} else {
|
|
||||||
AFMatrix.setLikelihoods(GL.getPosteriors(), GL.getSample());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean isClearRefCall(BiallelicGenotypeLikelihoods GL) {
|
|
||||||
if ( GL.getAlleleA().isNonReference() )
|
|
||||||
return false;
|
|
||||||
|
|
||||||
double[] likelihoods = GL.getLikelihoods();
|
|
||||||
double refLikelihoodMinusEpsilon = likelihoods[0] - LOG10_REFERENCE_CALL_EPSILON;
|
|
||||||
return ( refLikelihoodMinusEpsilon > likelihoods[1] && refLikelihoodMinusEpsilon > likelihoods[2]);
|
|
||||||
}
|
|
||||||
|
|
||||||
private int getUnfilteredDepth(ReadBackedPileup pileup) {
|
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for ( PileupElement p : pileup ) {
|
for ( PileupElement p : pileup ) {
|
||||||
if ( DiploidSNPGenotypeLikelihoods.usableBase(p, true) )
|
if ( DiploidSNPGenotypeLikelihoods.usableBase(p, true) )
|
||||||
|
|
@ -202,169 +113,4 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
||||||
|
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected class CalculatedAlleleFrequency {
|
|
||||||
|
|
||||||
public double log10PNonRef;
|
|
||||||
public int altAlleles;
|
|
||||||
|
|
||||||
public CalculatedAlleleFrequency(double log10PNonRef, int altAlleles) {
|
|
||||||
this.log10PNonRef = log10PNonRef;
|
|
||||||
this.altAlleles = altAlleles;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected enum GenotypeType { AA, AB, BB }
|
|
||||||
protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE;
|
|
||||||
|
|
||||||
protected static class AlleleFrequencyMatrix {
|
|
||||||
|
|
||||||
private double[][] matrix; // allele frequency matrix
|
|
||||||
private int[] indexes; // matrix to maintain which genotype is active
|
|
||||||
private int maxN; // total possible frequencies in data
|
|
||||||
private int frequency; // current frequency
|
|
||||||
|
|
||||||
// data structures necessary to maintain a list of the best genotypes and their scores
|
|
||||||
private ArrayList<String> samples = new ArrayList<String>();
|
|
||||||
private HashMap<Integer, HashMap<String, Pair<Integer, Double>>> samplesToGenotypesPerAF = new HashMap<Integer, HashMap<String, Pair<Integer, Double>>>();
|
|
||||||
|
|
||||||
public AlleleFrequencyMatrix(int N) {
|
|
||||||
maxN = N;
|
|
||||||
matrix = new double[N][3];
|
|
||||||
indexes = new int[N];
|
|
||||||
clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<String> getSamples() { return samples; }
|
|
||||||
|
|
||||||
public void clear() {
|
|
||||||
frequency = 0;
|
|
||||||
for (int i = 0; i < maxN; i++)
|
|
||||||
indexes[i] = 0;
|
|
||||||
samples.clear();
|
|
||||||
samplesToGenotypesPerAF.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setLikelihoods(double AA, double AB, double BB, String sample) {
|
|
||||||
int index = samples.size();
|
|
||||||
samples.add(sample);
|
|
||||||
matrix[index][GenotypeType.AA.ordinal()] = AA;
|
|
||||||
matrix[index][GenotypeType.AB.ordinal()] = AB;
|
|
||||||
matrix[index][GenotypeType.BB.ordinal()] = BB;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setLikelihoods(double[] GLs, String sample) {
|
|
||||||
int index = samples.size();
|
|
||||||
samples.add(sample);
|
|
||||||
matrix[index][GenotypeType.AA.ordinal()] = GLs[0];
|
|
||||||
matrix[index][GenotypeType.AB.ordinal()] = GLs[1];
|
|
||||||
matrix[index][GenotypeType.BB.ordinal()] = GLs[2];
|
|
||||||
}
|
|
||||||
|
|
||||||
public void incrementFrequency() {
|
|
||||||
int N = samples.size();
|
|
||||||
if ( frequency == 2 * N )
|
|
||||||
throw new ReviewedStingException("Frequency was incremented past N; how is this possible?");
|
|
||||||
frequency++;
|
|
||||||
|
|
||||||
double greedy = VALUE_NOT_CALCULATED;
|
|
||||||
int greedyIndex = -1;
|
|
||||||
for (int i = 0; i < N; i++) {
|
|
||||||
|
|
||||||
if ( indexes[i] == GenotypeType.AB.ordinal() ) {
|
|
||||||
if ( matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()] > greedy ) {
|
|
||||||
greedy = matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()];
|
|
||||||
greedyIndex = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if ( indexes[i] == GenotypeType.AA.ordinal() ) {
|
|
||||||
if ( matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()] > greedy ) {
|
|
||||||
greedy = matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()];
|
|
||||||
greedyIndex = i;
|
|
||||||
}
|
|
||||||
// note that we currently don't bother with breaking ties between samples
|
|
||||||
// (which would be done by looking at the HOM_VAR value) because it's highly
|
|
||||||
// unlikely that a collision will both occur and that the difference will
|
|
||||||
// be significant at HOM_VAR...
|
|
||||||
}
|
|
||||||
// if this person is already hom var, he can't add another alternate allele
|
|
||||||
// so we can ignore that case
|
|
||||||
}
|
|
||||||
if ( greedyIndex == -1 )
|
|
||||||
throw new ReviewedStingException("There is no best choice for a new alternate allele; how is this possible?");
|
|
||||||
|
|
||||||
if ( indexes[greedyIndex] == GenotypeType.AB.ordinal() )
|
|
||||||
indexes[greedyIndex] = GenotypeType.BB.ordinal();
|
|
||||||
else
|
|
||||||
indexes[greedyIndex] = GenotypeType.AB.ordinal();
|
|
||||||
}
|
|
||||||
|
|
||||||
public double getLikelihoodsOfFrequency() {
|
|
||||||
double likelihoods = 0.0;
|
|
||||||
int N = samples.size();
|
|
||||||
for (int i = 0; i < N; i++)
|
|
||||||
likelihoods += matrix[i][indexes[i]];
|
|
||||||
|
|
||||||
/*
|
|
||||||
System.out.println(frequency);
|
|
||||||
for (int i = 0; i < N; i++) {
|
|
||||||
for (int j=0; j < 3; j++) {
|
|
||||||
System.out.print(String.valueOf(matrix[i][j]));
|
|
||||||
System.out.print(indexes[i] == j ? "* " : " ");
|
|
||||||
}
|
|
||||||
System.out.println();
|
|
||||||
}
|
|
||||||
System.out.println(likelihoods);
|
|
||||||
System.out.println();
|
|
||||||
*/
|
|
||||||
|
|
||||||
recordGenotypes();
|
|
||||||
|
|
||||||
return likelihoods;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Pair<Integer, Double> getGenotype(int frequency, String sample) {
|
|
||||||
// it's possible that the genotype conformation hash wasn't initialized
|
|
||||||
if ( samplesToGenotypesPerAF.get(frequency) == null ) {
|
|
||||||
// confirm that the current frequency is the target one
|
|
||||||
if ( frequency != this.frequency )
|
|
||||||
throw new IllegalStateException("Attempting to retrieve genotypes from an uninitialized hash for frequency " + frequency + " (the hash is current standing at frequency " + this.frequency + ")");
|
|
||||||
recordGenotypes();
|
|
||||||
}
|
|
||||||
return samplesToGenotypesPerAF.get(frequency).get(sample);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void recordGenotypes() {
|
|
||||||
HashMap<String, Pair<Integer, Double>> samplesToGenotypes = new HashMap<String, Pair<Integer, Double>>();
|
|
||||||
|
|
||||||
int index = 0;
|
|
||||||
for ( String sample : samples ) {
|
|
||||||
int genotype = indexes[index];
|
|
||||||
|
|
||||||
double score;
|
|
||||||
|
|
||||||
int maxEntry = MathUtils.maxElementIndex(matrix[index]);
|
|
||||||
// if the max value is for the most likely genotype, we can compute next vs. next best
|
|
||||||
if ( genotype == maxEntry ) {
|
|
||||||
if ( genotype == GenotypeType.AA.ordinal() )
|
|
||||||
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AB.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
|
|
||||||
else if ( genotype == GenotypeType.AB.ordinal() )
|
|
||||||
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
|
|
||||||
else // ( genotype == GenotypeType.HOM.ordinal() )
|
|
||||||
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.AB.ordinal()]);
|
|
||||||
}
|
|
||||||
// otherwise, we need to calculate the probability of the genotype
|
|
||||||
else {
|
|
||||||
double[] normalized = MathUtils.normalizeFromLog10(matrix[index]);
|
|
||||||
double chosenGenotype = normalized[genotype];
|
|
||||||
score = -1.0 * Math.log10(1.0 - chosenGenotype);
|
|
||||||
}
|
|
||||||
|
|
||||||
samplesToGenotypes.put(sample, new Pair<Integer, Double>(genotype, Math.abs(score)));
|
|
||||||
index++;
|
|
||||||
}
|
|
||||||
|
|
||||||
samplesToGenotypesPerAF.put(frequency, samplesToGenotypes);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
@ -46,7 +46,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
private static final double LOGEPS = -300;
|
private static final double LOGEPS = -300;
|
||||||
|
|
||||||
protected ExactAFCalculationModel(int N, Logger logger, PrintStream verboseWriter) {
|
protected ExactAFCalculationModel(int N, Logger logger, PrintStream verboseWriter) {
|
||||||
super(N, logger, verboseWriter, true);
|
super(N, logger, verboseWriter);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void getLog10PNonRef(RefMetaDataTracker tracker,
|
public void getLog10PNonRef(RefMetaDataTracker tracker,
|
||||||
|
|
@ -151,25 +151,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
||||||
double[] log10AlleleFrequencyPosteriors,
|
double[] log10AlleleFrequencyPosteriors,
|
||||||
int AFofMaxLikelihood) {
|
int AFofMaxLikelihood) {
|
||||||
|
|
||||||
// overriding implementation in AlleleFrequencyCalculationModel to avoid filling out AF matrix which is not used.
|
|
||||||
return generateCalls(contexts, GLs, AFofMaxLikelihood);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected Map<String, Genotype> generateCalls(Map<String, StratifiedAlignmentContext> contexts,
|
|
||||||
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
|
||||||
int bestAFguess) {
|
|
||||||
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
|
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
|
||||||
|
|
||||||
|
|
||||||
double[][] pathMetricArray = new double[GLs.size()+1][bestAFguess+1];
|
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
|
||||||
int[][] tracebackArray = new int[GLs.size()+1][bestAFguess+1];
|
int[][] tracebackArray = new int[GLs.size()+1][AFofMaxLikelihood+1];
|
||||||
|
|
||||||
ArrayList<String> sampleIndices = new ArrayList<String>();
|
ArrayList<String> sampleIndices = new ArrayList<String>();
|
||||||
int sampleIdx = 0;
|
int sampleIdx = 0;
|
||||||
|
|
||||||
// todo - optimize initialization
|
// todo - optimize initialization
|
||||||
for (int k=0; k <= bestAFguess; k++)
|
for (int k=0; k <= AFofMaxLikelihood; k++)
|
||||||
for (int j=0; j <= GLs.size(); j++)
|
for (int j=0; j <= GLs.size(); j++)
|
||||||
pathMetricArray[j][k] = -1e30;
|
pathMetricArray[j][k] = -1e30;
|
||||||
|
|
||||||
|
|
@ -186,7 +178,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
|
|
||||||
double[] likelihoods = GLs.get(sample).getLikelihoods();
|
double[] likelihoods = GLs.get(sample).getLikelihoods();
|
||||||
|
|
||||||
for (int k=0; k <= bestAFguess; k++) {
|
for (int k=0; k <= AFofMaxLikelihood; k++) {
|
||||||
|
|
||||||
double bestMetric = pathMetricArray[sampleIdx][k] + likelihoods[0];
|
double bestMetric = pathMetricArray[sampleIdx][k] + likelihoods[0];
|
||||||
int bestIndex = k;
|
int bestIndex = k;
|
||||||
|
|
@ -214,7 +206,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int startIdx = bestAFguess;
|
int startIdx = AFofMaxLikelihood;
|
||||||
for (int k = sampleIdx; k > 0; k--) {
|
for (int k = sampleIdx; k > 0; k--) {
|
||||||
int bestGTguess;
|
int bestGTguess;
|
||||||
String sample = sampleIndices.get(k-1);
|
String sample = sampleIndices.get(k-1);
|
||||||
|
|
@ -263,7 +255,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
return calls;
|
return calls;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -27,9 +27,15 @@ package org.broadinstitute.sting.playground.gatk.walkers.genotyper;
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broad.tribble.util.variantcontext.Genotype;
|
import org.broad.tribble.util.variantcontext.Genotype;
|
||||||
|
import org.broad.tribble.util.variantcontext.Allele;
|
||||||
|
import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
|
||||||
|
import org.broad.tribble.vcf.VCFConstants;
|
||||||
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
|
|
@ -40,17 +46,19 @@ public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel {
|
||||||
// how much off from the max likelihoods do we need to be before we can quit calculating?
|
// how much off from the max likelihoods do we need to be before we can quit calculating?
|
||||||
protected static final double LOG10_OPTIMIZATION_EPSILON = 8.0;
|
protected static final double LOG10_OPTIMIZATION_EPSILON = 8.0;
|
||||||
|
|
||||||
|
private AlleleFrequencyMatrix AFMatrix;
|
||||||
|
|
||||||
protected GridSearchAFEstimation(int N, Logger logger, PrintStream verboseWriter) {
|
protected GridSearchAFEstimation(int N, Logger logger, PrintStream verboseWriter) {
|
||||||
super(N, logger, verboseWriter, false);
|
super(N, logger, verboseWriter);
|
||||||
|
AFMatrix = new AlleleFrequencyMatrix(N);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void getLog10PNonRef(RefMetaDataTracker tracker,
|
protected void getLog10PNonRef(RefMetaDataTracker tracker,
|
||||||
ReferenceContext ref,
|
ReferenceContext ref,
|
||||||
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
||||||
double[] log10AlleleFrequencyPriors,
|
double[] log10AlleleFrequencyPriors,
|
||||||
double[] log10AlleleFrequencyPosteriors,
|
double[] log10AlleleFrequencyPosteriors,
|
||||||
int minFrequencyToCalculate) {
|
int minFrequencyToCalculate) {
|
||||||
|
|
||||||
initializeAFMatrix(GLs);
|
initializeAFMatrix(GLs);
|
||||||
|
|
||||||
// first, calculate for AF=0 (no change to matrix)
|
// first, calculate for AF=0 (no change to matrix)
|
||||||
|
|
@ -90,10 +98,192 @@ public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel {
|
||||||
*
|
*
|
||||||
* @return calls
|
* @return calls
|
||||||
*/
|
*/
|
||||||
public Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
|
protected Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
|
||||||
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
Map<String, BiallelicGenotypeLikelihoods> GLs,
|
||||||
double[] log10AlleleFrequencyPosteriors,
|
double[] log10AlleleFrequencyPosteriors,
|
||||||
int AFofMaxLikelihood) {
|
int AFofMaxLikelihood) {
|
||||||
return generateCalls(contexts, GLs, AFofMaxLikelihood);
|
HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
|
||||||
|
|
||||||
|
// first, the potential alt calls
|
||||||
|
for ( String sample : AFMatrix.getSamples() ) {
|
||||||
|
BiallelicGenotypeLikelihoods GL = GLs.get(sample);
|
||||||
|
Allele alleleA = GL.getAlleleA();
|
||||||
|
Allele alleleB = GL.getAlleleB();
|
||||||
|
|
||||||
|
// set the genotype and confidence
|
||||||
|
Pair<Integer, Double> AFbasedGenotype = AFMatrix.getGenotype(AFofMaxLikelihood, sample);
|
||||||
|
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||||
|
if ( AFbasedGenotype.first == GenotypeType.AA.ordinal() ) {
|
||||||
|
myAlleles.add(alleleA);
|
||||||
|
myAlleles.add(alleleA);
|
||||||
|
} else if ( AFbasedGenotype.first == GenotypeType.AB.ordinal() ) {
|
||||||
|
myAlleles.add(alleleA);
|
||||||
|
myAlleles.add(alleleB);
|
||||||
|
} else { // ( AFbasedGenotype.first == GenotypeType.BB.ordinal() )
|
||||||
|
myAlleles.add(alleleB);
|
||||||
|
myAlleles.add(alleleB);
|
||||||
|
}
|
||||||
|
|
||||||
|
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||||
|
attributes.put(VCFConstants.DEPTH_KEY, getUnfilteredDepth(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup()));
|
||||||
|
|
||||||
|
GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
|
||||||
|
attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());
|
||||||
|
|
||||||
|
calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, attributes, false));
|
||||||
|
}
|
||||||
|
|
||||||
|
return calls;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initializeAFMatrix(Map<String, BiallelicGenotypeLikelihoods> GLs) {
|
||||||
|
AFMatrix.clear();
|
||||||
|
|
||||||
|
for ( BiallelicGenotypeLikelihoods GL : GLs.values() )
|
||||||
|
AFMatrix.setLikelihoods(GL.getPosteriors(), GL.getSample());
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static class AlleleFrequencyMatrix {
|
||||||
|
|
||||||
|
private double[][] matrix; // allele frequency matrix
|
||||||
|
private int[] indexes; // matrix to maintain which genotype is active
|
||||||
|
private int maxN; // total possible frequencies in data
|
||||||
|
private int frequency; // current frequency
|
||||||
|
|
||||||
|
// data structures necessary to maintain a list of the best genotypes and their scores
|
||||||
|
private ArrayList<String> samples = new ArrayList<String>();
|
||||||
|
private HashMap<Integer, HashMap<String, Pair<Integer, Double>>> samplesToGenotypesPerAF = new HashMap<Integer, HashMap<String, Pair<Integer, Double>>>();
|
||||||
|
|
||||||
|
public AlleleFrequencyMatrix(int N) {
|
||||||
|
maxN = N;
|
||||||
|
matrix = new double[N][3];
|
||||||
|
indexes = new int[N];
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getSamples() { return samples; }
|
||||||
|
|
||||||
|
public void clear() {
|
||||||
|
frequency = 0;
|
||||||
|
for (int i = 0; i < maxN; i++)
|
||||||
|
indexes[i] = 0;
|
||||||
|
samples.clear();
|
||||||
|
samplesToGenotypesPerAF.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLikelihoods(double AA, double AB, double BB, String sample) {
|
||||||
|
int index = samples.size();
|
||||||
|
samples.add(sample);
|
||||||
|
matrix[index][GenotypeType.AA.ordinal()] = AA;
|
||||||
|
matrix[index][GenotypeType.AB.ordinal()] = AB;
|
||||||
|
matrix[index][GenotypeType.BB.ordinal()] = BB;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLikelihoods(double[] GLs, String sample) {
|
||||||
|
int index = samples.size();
|
||||||
|
samples.add(sample);
|
||||||
|
matrix[index][GenotypeType.AA.ordinal()] = GLs[0];
|
||||||
|
matrix[index][GenotypeType.AB.ordinal()] = GLs[1];
|
||||||
|
matrix[index][GenotypeType.BB.ordinal()] = GLs[2];
|
||||||
|
}
|
||||||
|
|
||||||
|
public void incrementFrequency() {
|
||||||
|
int N = samples.size();
|
||||||
|
if ( frequency == 2 * N )
|
||||||
|
throw new ReviewedStingException("Frequency was incremented past N; how is this possible?");
|
||||||
|
frequency++;
|
||||||
|
|
||||||
|
double greedy = VALUE_NOT_CALCULATED;
|
||||||
|
int greedyIndex = -1;
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
|
||||||
|
if ( indexes[i] == GenotypeType.AB.ordinal() ) {
|
||||||
|
if ( matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()] > greedy ) {
|
||||||
|
greedy = matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()];
|
||||||
|
greedyIndex = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if ( indexes[i] == GenotypeType.AA.ordinal() ) {
|
||||||
|
if ( matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()] > greedy ) {
|
||||||
|
greedy = matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()];
|
||||||
|
greedyIndex = i;
|
||||||
|
}
|
||||||
|
// note that we currently don't bother with breaking ties between samples
|
||||||
|
// (which would be done by looking at the HOM_VAR value) because it's highly
|
||||||
|
// unlikely that a collision will both occur and that the difference will
|
||||||
|
// be significant at HOM_VAR...
|
||||||
|
}
|
||||||
|
// if this person is already hom var, he can't add another alternate allele
|
||||||
|
// so we can ignore that case
|
||||||
|
}
|
||||||
|
if ( greedyIndex == -1 )
|
||||||
|
throw new ReviewedStingException("There is no best choice for a new alternate allele; how is this possible?");
|
||||||
|
|
||||||
|
if ( indexes[greedyIndex] == GenotypeType.AB.ordinal() )
|
||||||
|
indexes[greedyIndex] = GenotypeType.BB.ordinal();
|
||||||
|
else
|
||||||
|
indexes[greedyIndex] = GenotypeType.AB.ordinal();
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getLikelihoodsOfFrequency() {
|
||||||
|
double likelihoods = 0.0;
|
||||||
|
int N = samples.size();
|
||||||
|
for (int i = 0; i < N; i++)
|
||||||
|
likelihoods += matrix[i][indexes[i]];
|
||||||
|
|
||||||
|
/*
|
||||||
|
System.out.println(frequency);
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
for (int j=0; j < 3; j++) {
|
||||||
|
System.out.print(String.valueOf(matrix[i][j]));
|
||||||
|
System.out.print(indexes[i] == j ? "* " : " ");
|
||||||
|
}
|
||||||
|
System.out.println();
|
||||||
|
}
|
||||||
|
System.out.println(likelihoods);
|
||||||
|
System.out.println();
|
||||||
|
*/
|
||||||
|
|
||||||
|
recordGenotypes();
|
||||||
|
|
||||||
|
return likelihoods;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Pair<Integer, Double> getGenotype(int frequency, String sample) {
|
||||||
|
return samplesToGenotypesPerAF.get(frequency).get(sample);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void recordGenotypes() {
|
||||||
|
HashMap<String, Pair<Integer, Double>> samplesToGenotypes = new HashMap<String, Pair<Integer, Double>>();
|
||||||
|
|
||||||
|
int index = 0;
|
||||||
|
for ( String sample : samples ) {
|
||||||
|
int genotype = indexes[index];
|
||||||
|
|
||||||
|
double score;
|
||||||
|
|
||||||
|
int maxEntry = MathUtils.maxElementIndex(matrix[index]);
|
||||||
|
// if the max value is for the most likely genotype, we can compute next vs. next best
|
||||||
|
if ( genotype == maxEntry ) {
|
||||||
|
if ( genotype == GenotypeType.AA.ordinal() )
|
||||||
|
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AB.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
|
||||||
|
else if ( genotype == GenotypeType.AB.ordinal() )
|
||||||
|
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
|
||||||
|
else // ( genotype == GenotypeType.HOM.ordinal() )
|
||||||
|
score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.AB.ordinal()]);
|
||||||
|
}
|
||||||
|
// otherwise, we need to calculate the probability of the genotype
|
||||||
|
else {
|
||||||
|
double[] normalized = MathUtils.normalizeFromLog10(matrix[index]);
|
||||||
|
double chosenGenotype = normalized[genotype];
|
||||||
|
score = -1.0 * Math.log10(1.0 - chosenGenotype);
|
||||||
|
}
|
||||||
|
|
||||||
|
samplesToGenotypes.put(sample, new Pair<Integer, Double>(genotype, Math.abs(score)));
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
|
||||||
|
samplesToGenotypesPerAF.put(frequency, samplesToGenotypes);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Loading…
Reference in New Issue