Now that G's new genotyping algorithm is live, I've cleaned up the code so the grid-search model is fully separated from the exact model. AlleleFrequencyCalculationModel is now completely abstract.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4517 348d0f76-0448-11de-a6fe-93d51630548a
commit d54d9880d7
parent 80e5ac65b4
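The shape of the refactor, in miniature: the base class keeps only the shared state and declares the per-site operations abstract, while each concrete model (exact vs. grid search) carries its own implementation and its own private machinery, as the diff below shows. Here is a minimal sketch of that pattern; the class and method names are hypothetical stand-ins for illustration, not the actual Sting classes:

// Illustrative sketch of an abstract-base / concrete-strategy split like the one in this commit.
// All names here are hypothetical; they do not correspond to real Sting classes.
abstract class AFCalcSketch {
    protected final int sampleCount;   // shared state lives in the base

    protected AFCalcSketch(int sampleCount) {
        this.sampleCount = sampleCount;
    }

    // Each model must supply its own log10 P(AF > 0); no default implementation remains in the base.
    protected abstract double log10PNonRef(double[] log10Priors);
}

final class ExactSketch extends AFCalcSketch {
    ExactSketch(int n) { super(n); }

    @Override
    protected double log10PNonRef(double[] log10Priors) {
        // placeholder for the exact calculation
        return log10Priors.length > 1 ? log10Priors[1] : Double.NEGATIVE_INFINITY;
    }
}

final class GridSearchSketch extends AFCalcSketch {
    private final double[] grid;       // model-specific state stays in the subclass

    GridSearchSketch(int n) {
        super(n);
        grid = new double[2 * n + 1];  // one cell per possible alternate-allele count
    }

    @Override
    protected double log10PNonRef(double[] log10Priors) {
        // placeholder for the greedy grid walk over allele frequencies
        double best = Double.NEGATIVE_INFINITY;
        for (int af = 1; af < grid.length && af < log10Priors.length; af++)
            best = Math.max(best, grid[af] + log10Priors[af]);
        return best;
    }
}

Model-specific structures follow the same rule as behavior: the AlleleFrequencyMatrix moves out of the base class and into the grid-search subclass that actually uses it.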
@@ -29,15 +29,9 @@ import org.apache.log4j.Logger;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.pileup.PileupElement;
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-import org.broadinstitute.sting.utils.collections.Pair;
 import org.broad.tribble.util.variantcontext.Genotype;
-import org.broad.tribble.util.variantcontext.Allele;
-import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
-import org.broad.tribble.vcf.VCFConstants;

 import java.io.PrintStream;
 import java.util.*;
@@ -54,154 +48,14 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
         GRID_SEARCH
     }

-    private static final double LOG10_REFERENCE_CALL_EPSILON = 1.0;
-
     protected int N;
-    protected AlleleFrequencyMatrix AFMatrix;
-    protected Set<BiallelicGenotypeLikelihoods> refCalls;

     protected Logger logger;
     protected PrintStream verboseWriter;

-    protected boolean useReferenceSampleOptimization;
-    protected enum GenotypeType { AA, AB, BB }
-
-    protected AlleleFrequencyCalculationModel(int N, Logger logger, PrintStream verboseWriter, boolean useReferenceSampleOptimization) {
-        this.N = N;
-        this.logger = logger;
-        this.verboseWriter = verboseWriter;
-        this.useReferenceSampleOptimization = useReferenceSampleOptimization;
-        AFMatrix = new AlleleFrequencyMatrix(N);
-        refCalls = new HashSet<BiallelicGenotypeLikelihoods>();
-    }
-
-    /**
-     * Must be overridden by concrete subclasses
-     * @param tracker rod data
-     * @param ref reference context
-     * @param GLs genotype likelihoods
-     * @param log10AlleleFrequencyPriors priors
-     * @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results
-     * @param minFrequencyToCalculate the minimum frequency which needs to be calculated
-     */
-    public abstract void getLog10PNonRef(RefMetaDataTracker tracker,
-                                         ReferenceContext ref,
-                                         Map<String, BiallelicGenotypeLikelihoods> GLs,
-                                         double[] log10AlleleFrequencyPriors,
-                                         double[] log10AlleleFrequencyPosteriors,
-                                         int minFrequencyToCalculate);
-
-    /**
-     * Can be overridden by concrete subclasses
-     * @param contexts alignment contexts
-     * @param GLs genotype likelihoods
-     * @param log10AlleleFrequencyPosteriors allele frequency results
-     * @param AFofMaxLikelihood allele frequency of max likelihood
-     *
-     * @return calls
-     */
-    public Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
-                                                 Map<String, BiallelicGenotypeLikelihoods> GLs,
-                                                 double[] log10AlleleFrequencyPosteriors,
-                                                 int AFofMaxLikelihood) {
-        initializeAFMatrix(GLs);
-
-        // increment the grid
-        for (int i = 1; i <= AFofMaxLikelihood; i++) {
-            // add one more alternate allele
-            AFMatrix.incrementFrequency();
-        }
-
-        return generateCalls(contexts, GLs, AFofMaxLikelihood);
-    }
-
-    protected Map<String, Genotype> generateCalls(Map<String, StratifiedAlignmentContext> contexts,
-                                                  Map<String, BiallelicGenotypeLikelihoods> GLs,
-                                                  int frequency) {
-        HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
-
-        // first, the potential alt calls
-        for ( String sample : AFMatrix.getSamples() ) {
-            BiallelicGenotypeLikelihoods GL = GLs.get(sample);
-            Allele alleleA = GL.getAlleleA();
-            Allele alleleB = GL.getAlleleB();
-
-            // set the genotype and confidence
-            Pair<Integer, Double> AFbasedGenotype = AFMatrix.getGenotype(frequency, sample);
-            ArrayList<Allele> myAlleles = new ArrayList<Allele>();
-            if ( AFbasedGenotype.first == GenotypeType.AA.ordinal() ) {
-                myAlleles.add(alleleA);
-                myAlleles.add(alleleA);
-            } else if ( AFbasedGenotype.first == GenotypeType.AB.ordinal() ) {
-                myAlleles.add(alleleA);
-                myAlleles.add(alleleB);
-            } else { // ( AFbasedGenotype.first == GenotypeType.BB.ordinal() )
-                myAlleles.add(alleleB);
-                myAlleles.add(alleleB);
-            }
-
-            HashMap<String, Object> attributes = new HashMap<String, Object>();
-            attributes.put(VCFConstants.DEPTH_KEY, getUnfilteredDepth(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup()));
-
-            GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
-            attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());
-
-            calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, attributes, false));
-        }
-
-        // now, the clearly ref calls
-        for ( BiallelicGenotypeLikelihoods GL : refCalls ) {
-            String sample = GL.getSample();
-
-            Allele ref = GL.getAlleleA();
-            ArrayList<Allele> myAlleles = new ArrayList<Allele>();
-            myAlleles.add(ref);
-            myAlleles.add(ref);
-
-            HashMap<String, Object> attributes = new HashMap<String, Object>();
-            attributes.put(VCFConstants.DEPTH_KEY, getUnfilteredDepth(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup()));
-
-            GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
-            attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());
-
-            double GQ = GL.getAALikelihoods() - Math.max(GL.getABLikelihoods(), GL.getBBLikelihoods());
-
-            calls.put(sample, new Genotype(sample, myAlleles, GQ, null, attributes, false));
-        }
-
-        return calls;
-    }
-
-    protected void initializeAFMatrix(Map<String, BiallelicGenotypeLikelihoods> GLs) {
-        refCalls.clear();
-        AFMatrix.clear();
-
-        for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) {
-            if ( useReferenceSampleOptimization && isClearRefCall(GL) ) {
-                refCalls.add(GL);
-            } else {
-                AFMatrix.setLikelihoods(GL.getPosteriors(), GL.getSample());
-            }
-        }
-    }
-
-    private boolean isClearRefCall(BiallelicGenotypeLikelihoods GL) {
-        if ( GL.getAlleleA().isNonReference() )
-            return false;
-
-        double[] likelihoods = GL.getLikelihoods();
-        double refLikelihoodMinusEpsilon = likelihoods[0] - LOG10_REFERENCE_CALL_EPSILON;
-        return ( refLikelihoodMinusEpsilon > likelihoods[1] && refLikelihoodMinusEpsilon > likelihoods[2]);
-    }
-
-    private int getUnfilteredDepth(ReadBackedPileup pileup) {
-        int count = 0;
-        for ( PileupElement p : pileup ) {
-            if ( DiploidSNPGenotypeLikelihoods.usableBase(p, true) )
-                count++;
-        }
-
-        return count;
-    }
-
-    protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE;

     protected class CalculatedAlleleFrequency {

@@ -214,157 +68,49 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
         }
     }

-    protected static class AlleleFrequencyMatrix {
-
-        private double[][] matrix;    // allele frequency matrix
-        private int[] indexes;        // matrix to maintain which genotype is active
-        private int maxN;             // total possible frequencies in data
-        private int frequency;        // current frequency
-
-        // data structures necessary to maintain a list of the best genotypes and their scores
-        private ArrayList<String> samples = new ArrayList<String>();
-        private HashMap<Integer, HashMap<String, Pair<Integer, Double>>> samplesToGenotypesPerAF = new HashMap<Integer, HashMap<String, Pair<Integer, Double>>>();
-
-        public AlleleFrequencyMatrix(int N) {
-            maxN = N;
-            matrix = new double[N][3];
-            indexes = new int[N];
-            clear();
-        }
-
-        public List<String> getSamples() { return samples; }
-
-        public void clear() {
-            frequency = 0;
-            for (int i = 0; i < maxN; i++)
-                indexes[i] = 0;
-            samples.clear();
-            samplesToGenotypesPerAF.clear();
-        }
-
-        public void setLikelihoods(double AA, double AB, double BB, String sample) {
-            int index = samples.size();
-            samples.add(sample);
-            matrix[index][GenotypeType.AA.ordinal()] = AA;
-            matrix[index][GenotypeType.AB.ordinal()] = AB;
-            matrix[index][GenotypeType.BB.ordinal()] = BB;
-        }
-
-        public void setLikelihoods(double[] GLs, String sample) {
-            int index = samples.size();
-            samples.add(sample);
-            matrix[index][GenotypeType.AA.ordinal()] = GLs[0];
-            matrix[index][GenotypeType.AB.ordinal()] = GLs[1];
-            matrix[index][GenotypeType.BB.ordinal()] = GLs[2];
-        }
-
-        public void incrementFrequency() {
-            int N = samples.size();
-            if ( frequency == 2 * N )
-                throw new ReviewedStingException("Frequency was incremented past N; how is this possible?");
-            frequency++;
-
-            double greedy = VALUE_NOT_CALCULATED;
-            int greedyIndex = -1;
-            for (int i = 0; i < N; i++) {
-
-                if ( indexes[i] == GenotypeType.AB.ordinal() ) {
-                    if ( matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()] > greedy ) {
-                        greedy = matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()];
-                        greedyIndex = i;
-                    }
-                }
-                else if ( indexes[i] == GenotypeType.AA.ordinal() ) {
-                    if ( matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()] > greedy ) {
-                        greedy = matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()];
-                        greedyIndex = i;
-                    }
-                    // note that we currently don't bother with breaking ties between samples
-                    // (which would be done by looking at the HOM_VAR value) because it's highly
-                    // unlikely that a collision will both occur and that the difference will
-                    // be significant at HOM_VAR...
-                }
-                // if this person is already hom var, he can't add another alternate allele
-                // so we can ignore that case
-            }
-            if ( greedyIndex == -1 )
-                throw new ReviewedStingException("There is no best choice for a new alternate allele; how is this possible?");
-
-            if ( indexes[greedyIndex] == GenotypeType.AB.ordinal() )
-                indexes[greedyIndex] = GenotypeType.BB.ordinal();
-            else
-                indexes[greedyIndex] = GenotypeType.AB.ordinal();
-        }
-
-        public double getLikelihoodsOfFrequency() {
-            double likelihoods = 0.0;
-            int N = samples.size();
-            for (int i = 0; i < N; i++)
-                likelihoods += matrix[i][indexes[i]];
-
-            /*
-            System.out.println(frequency);
-            for (int i = 0; i < N; i++) {
-                for (int j=0; j < 3; j++) {
-                    System.out.print(String.valueOf(matrix[i][j]));
-                    System.out.print(indexes[i] == j ? "* " : " ");
-                }
-                System.out.println();
-            }
-            System.out.println(likelihoods);
-            System.out.println();
-            */
-
-            recordGenotypes();
-
-            return likelihoods;
-        }
-
-        public Pair<Integer, Double> getGenotype(int frequency, String sample) {
-            // it's possible that the genotype conformation hash wasn't initialized
-            if ( samplesToGenotypesPerAF.get(frequency) == null ) {
-                // confirm that the current frequency is the target one
-                if ( frequency != this.frequency )
-                    throw new IllegalStateException("Attempting to retrieve genotypes from an uninitialized hash for frequency " + frequency + " (the hash is current standing at frequency " + this.frequency + ")");
-                recordGenotypes();
-            }
-            return samplesToGenotypesPerAF.get(frequency).get(sample);
-        }
-
-        private void recordGenotypes() {
-            HashMap<String, Pair<Integer, Double>> samplesToGenotypes = new HashMap<String, Pair<Integer, Double>>();
-
-            int index = 0;
-            for ( String sample : samples ) {
-                int genotype = indexes[index];
-
-                double score;
-
-                int maxEntry = MathUtils.maxElementIndex(matrix[index]);
-                // if the max value is for the most likely genotype, we can compute next vs. next best
-                if ( genotype == maxEntry ) {
-                    if ( genotype == GenotypeType.AA.ordinal() )
-                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AB.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
-                    else if ( genotype == GenotypeType.AB.ordinal() )
-                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
-                    else // ( genotype == GenotypeType.HOM.ordinal() )
-                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.AB.ordinal()]);
-                }
-                // otherwise, we need to calculate the probability of the genotype
-                else {
-                    double[] normalized = MathUtils.normalizeFromLog10(matrix[index]);
-                    double chosenGenotype = normalized[genotype];
-                    score = -1.0 * Math.log10(1.0 - chosenGenotype);
-                }
-
-                samplesToGenotypes.put(sample, new Pair<Integer, Double>(genotype, Math.abs(score)));
-                index++;
-            }
-
-            samplesToGenotypesPerAF.put(frequency, samplesToGenotypes);
-        }
-    }
+    protected enum GenotypeType { AA, AB, BB }
+    protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE;
+
+    protected AlleleFrequencyCalculationModel(int N, Logger logger, PrintStream verboseWriter) {
+        this.N = N;
+        this.logger = logger;
+        this.verboseWriter = verboseWriter;
+    }
+
+    /**
+     * Must be overridden by concrete subclasses
+     * @param tracker rod data
+     * @param ref reference context
+     * @param GLs genotype likelihoods
+     * @param log10AlleleFrequencyPriors priors
+     * @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results
+     * @param minFrequencyToCalculate the minimum frequency which needs to be calculated
+     */
+    protected abstract void getLog10PNonRef(RefMetaDataTracker tracker,
+                                            ReferenceContext ref,
+                                            Map<String, BiallelicGenotypeLikelihoods> GLs,
+                                            double[] log10AlleleFrequencyPriors,
+                                            double[] log10AlleleFrequencyPosteriors,
+                                            int minFrequencyToCalculate);
+
+    /**
+     * Can be overridden by concrete subclasses
+     * @param contexts alignment contexts
+     * @param GLs genotype likelihoods
+     * @param log10AlleleFrequencyPosteriors allele frequency results
+     * @param AFofMaxLikelihood allele frequency of max likelihood
+     *
+     * @return calls
+     */
+    protected abstract Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
+                                                             Map<String, BiallelicGenotypeLikelihoods> GLs,
+                                                             double[] log10AlleleFrequencyPosteriors,
+                                                             int AFofMaxLikelihood);
+
+    protected int getUnfilteredDepth(ReadBackedPileup pileup) {
+        int count = 0;
+        for ( PileupElement p : pileup ) {
+            if ( DiploidSNPGenotypeLikelihoods.usableBase(p, true) )
+                count++;
+        }
+
+        return count;
+    }
 }
@@ -46,7 +46,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
     private static final double LOGEPS = -300;

     protected ExactAFCalculationModel(int N, Logger logger, PrintStream verboseWriter) {
-        super(N, logger, verboseWriter, true);
+        super(N, logger, verboseWriter);
     }

     public void getLog10PNonRef(RefMetaDataTracker tracker,
@@ -151,25 +151,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
                                                  Map<String, BiallelicGenotypeLikelihoods> GLs,
                                                  double[] log10AlleleFrequencyPosteriors,
                                                  int AFofMaxLikelihood) {
-
-        // overriding implementation in AlleleFrequencyCalculationModel to avoid filling out AF matrix which is not used.
-        return generateCalls(contexts, GLs, AFofMaxLikelihood);
-    }
-
-    protected Map<String, Genotype> generateCalls(Map<String, StratifiedAlignmentContext> contexts,
-                                                  Map<String, BiallelicGenotypeLikelihoods> GLs,
-                                                  int bestAFguess) {
-        HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
+        HashMap<String, Genotype> calls = new HashMap<String, Genotype>();

-        double[][] pathMetricArray = new double[GLs.size()+1][bestAFguess+1];
-        int[][] tracebackArray = new int[GLs.size()+1][bestAFguess+1];
+        double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
+        int[][] tracebackArray = new int[GLs.size()+1][AFofMaxLikelihood+1];

         ArrayList<String> sampleIndices = new ArrayList<String>();
         int sampleIdx = 0;

         // todo - optimize initialization
-        for (int k=0; k <= bestAFguess; k++)
+        for (int k=0; k <= AFofMaxLikelihood; k++)
             for (int j=0; j <= GLs.size(); j++)
                 pathMetricArray[j][k] = -1e30;

@@ -186,7 +178,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {

             double[] likelihoods = GLs.get(sample).getLikelihoods();

-            for (int k=0; k <= bestAFguess; k++) {
+            for (int k=0; k <= AFofMaxLikelihood; k++) {

                 double bestMetric = pathMetricArray[sampleIdx][k] + likelihoods[0];
                 int bestIndex = k;
@@ -214,7 +206,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
             }
         }

-        int startIdx = bestAFguess;
+        int startIdx = AFofMaxLikelihood;
         for (int k = sampleIdx; k > 0; k--) {
             int bestGTguess;
             String sample = sampleIndices.get(k-1);
@@ -263,7 +255,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {

         }

-
         return calls;
     }

@@ -27,9 +27,15 @@ package org.broadinstitute.sting.playground.gatk.walkers.genotyper;

 import org.apache.log4j.Logger;
 import org.broad.tribble.util.variantcontext.Genotype;
+import org.broad.tribble.util.variantcontext.Allele;
+import org.broad.tribble.util.variantcontext.GenotypeLikelihoods;
+import org.broad.tribble.vcf.VCFConstants;
 import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.utils.collections.Pair;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.MathUtils;

 import java.util.*;
 import java.io.PrintStream;
@@ -40,17 +46,19 @@ public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel {
     // how much off from the max likelihoods do we need to be before we can quit calculating?
     protected static final double LOG10_OPTIMIZATION_EPSILON = 8.0;

+    private AlleleFrequencyMatrix AFMatrix;
+
     protected GridSearchAFEstimation(int N, Logger logger, PrintStream verboseWriter) {
-        super(N, logger, verboseWriter, false);
+        super(N, logger, verboseWriter);
+        AFMatrix = new AlleleFrequencyMatrix(N);
     }

-    public void getLog10PNonRef(RefMetaDataTracker tracker,
-                                ReferenceContext ref,
-                                Map<String, BiallelicGenotypeLikelihoods> GLs,
-                                double[] log10AlleleFrequencyPriors,
-                                double[] log10AlleleFrequencyPosteriors,
-                                int minFrequencyToCalculate) {
+    protected void getLog10PNonRef(RefMetaDataTracker tracker,
+                                   ReferenceContext ref,
+                                   Map<String, BiallelicGenotypeLikelihoods> GLs,
+                                   double[] log10AlleleFrequencyPriors,
+                                   double[] log10AlleleFrequencyPosteriors,
+                                   int minFrequencyToCalculate) {
         initializeAFMatrix(GLs);

         // first, calculate for AF=0 (no change to matrix)
@@ -90,10 +98,192 @@ public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel {
      *
      * @return calls
      */
-    public Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
-                                                 Map<String, BiallelicGenotypeLikelihoods> GLs,
-                                                 double[] log10AlleleFrequencyPosteriors,
-                                                 int AFofMaxLikelihood) {
-        return generateCalls(contexts, GLs, AFofMaxLikelihood);
-    }
+    protected Map<String, Genotype> assignGenotypes(Map<String, StratifiedAlignmentContext> contexts,
+                                                    Map<String, BiallelicGenotypeLikelihoods> GLs,
+                                                    double[] log10AlleleFrequencyPosteriors,
+                                                    int AFofMaxLikelihood) {
+        HashMap<String, Genotype> calls = new HashMap<String, Genotype>();
+
+        // first, the potential alt calls
+        for ( String sample : AFMatrix.getSamples() ) {
+            BiallelicGenotypeLikelihoods GL = GLs.get(sample);
+            Allele alleleA = GL.getAlleleA();
+            Allele alleleB = GL.getAlleleB();
+
+            // set the genotype and confidence
+            Pair<Integer, Double> AFbasedGenotype = AFMatrix.getGenotype(AFofMaxLikelihood, sample);
+            ArrayList<Allele> myAlleles = new ArrayList<Allele>();
+            if ( AFbasedGenotype.first == GenotypeType.AA.ordinal() ) {
+                myAlleles.add(alleleA);
+                myAlleles.add(alleleA);
+            } else if ( AFbasedGenotype.first == GenotypeType.AB.ordinal() ) {
+                myAlleles.add(alleleA);
+                myAlleles.add(alleleB);
+            } else { // ( AFbasedGenotype.first == GenotypeType.BB.ordinal() )
+                myAlleles.add(alleleB);
+                myAlleles.add(alleleB);
+            }
+
+            HashMap<String, Object> attributes = new HashMap<String, Object>();
+            attributes.put(VCFConstants.DEPTH_KEY, getUnfilteredDepth(contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup()));
+
+            GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods());
+            attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString());
+
+            calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, attributes, false));
+        }
+
+        return calls;
+    }
+
+    private void initializeAFMatrix(Map<String, BiallelicGenotypeLikelihoods> GLs) {
+        AFMatrix.clear();
+
+        for ( BiallelicGenotypeLikelihoods GL : GLs.values() )
+            AFMatrix.setLikelihoods(GL.getPosteriors(), GL.getSample());
+    }
+
+    protected static class AlleleFrequencyMatrix {
+
+        private double[][] matrix;    // allele frequency matrix
+        private int[] indexes;        // matrix to maintain which genotype is active
+        private int maxN;             // total possible frequencies in data
+        private int frequency;        // current frequency
+
+        // data structures necessary to maintain a list of the best genotypes and their scores
+        private ArrayList<String> samples = new ArrayList<String>();
+        private HashMap<Integer, HashMap<String, Pair<Integer, Double>>> samplesToGenotypesPerAF = new HashMap<Integer, HashMap<String, Pair<Integer, Double>>>();
+
+        public AlleleFrequencyMatrix(int N) {
+            maxN = N;
+            matrix = new double[N][3];
+            indexes = new int[N];
+            clear();
+        }
+
+        public List<String> getSamples() { return samples; }
+
+        public void clear() {
+            frequency = 0;
+            for (int i = 0; i < maxN; i++)
+                indexes[i] = 0;
+            samples.clear();
+            samplesToGenotypesPerAF.clear();
+        }
+
+        public void setLikelihoods(double AA, double AB, double BB, String sample) {
+            int index = samples.size();
+            samples.add(sample);
+            matrix[index][GenotypeType.AA.ordinal()] = AA;
+            matrix[index][GenotypeType.AB.ordinal()] = AB;
+            matrix[index][GenotypeType.BB.ordinal()] = BB;
+        }
+
+        public void setLikelihoods(double[] GLs, String sample) {
+            int index = samples.size();
+            samples.add(sample);
+            matrix[index][GenotypeType.AA.ordinal()] = GLs[0];
+            matrix[index][GenotypeType.AB.ordinal()] = GLs[1];
+            matrix[index][GenotypeType.BB.ordinal()] = GLs[2];
+        }
+
+        public void incrementFrequency() {
+            int N = samples.size();
+            if ( frequency == 2 * N )
+                throw new ReviewedStingException("Frequency was incremented past N; how is this possible?");
+            frequency++;
+
+            double greedy = VALUE_NOT_CALCULATED;
+            int greedyIndex = -1;
+            for (int i = 0; i < N; i++) {
+
+                if ( indexes[i] == GenotypeType.AB.ordinal() ) {
+                    if ( matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()] > greedy ) {
+                        greedy = matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()];
+                        greedyIndex = i;
+                    }
+                }
+                else if ( indexes[i] == GenotypeType.AA.ordinal() ) {
+                    if ( matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()] > greedy ) {
+                        greedy = matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()];
+                        greedyIndex = i;
+                    }
+                    // note that we currently don't bother with breaking ties between samples
+                    // (which would be done by looking at the HOM_VAR value) because it's highly
+                    // unlikely that a collision will both occur and that the difference will
+                    // be significant at HOM_VAR...
+                }
+                // if this person is already hom var, he can't add another alternate allele
+                // so we can ignore that case
+            }
+            if ( greedyIndex == -1 )
+                throw new ReviewedStingException("There is no best choice for a new alternate allele; how is this possible?");
+
+            if ( indexes[greedyIndex] == GenotypeType.AB.ordinal() )
+                indexes[greedyIndex] = GenotypeType.BB.ordinal();
+            else
+                indexes[greedyIndex] = GenotypeType.AB.ordinal();
+        }
+
+        public double getLikelihoodsOfFrequency() {
+            double likelihoods = 0.0;
+            int N = samples.size();
+            for (int i = 0; i < N; i++)
+                likelihoods += matrix[i][indexes[i]];
+
+            /*
+            System.out.println(frequency);
+            for (int i = 0; i < N; i++) {
+                for (int j=0; j < 3; j++) {
+                    System.out.print(String.valueOf(matrix[i][j]));
+                    System.out.print(indexes[i] == j ? "* " : " ");
+                }
+                System.out.println();
+            }
+            System.out.println(likelihoods);
+            System.out.println();
+            */
+
+            recordGenotypes();
+
+            return likelihoods;
+        }
+
+        public Pair<Integer, Double> getGenotype(int frequency, String sample) {
+            return samplesToGenotypesPerAF.get(frequency).get(sample);
+        }
+
+        private void recordGenotypes() {
+            HashMap<String, Pair<Integer, Double>> samplesToGenotypes = new HashMap<String, Pair<Integer, Double>>();
+
+            int index = 0;
+            for ( String sample : samples ) {
+                int genotype = indexes[index];
+
+                double score;
+
+                int maxEntry = MathUtils.maxElementIndex(matrix[index]);
+                // if the max value is for the most likely genotype, we can compute next vs. next best
+                if ( genotype == maxEntry ) {
+                    if ( genotype == GenotypeType.AA.ordinal() )
+                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AB.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
+                    else if ( genotype == GenotypeType.AB.ordinal() )
+                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.BB.ordinal()]);
+                    else // ( genotype == GenotypeType.HOM.ordinal() )
+                        score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.AB.ordinal()]);
+                }
+                // otherwise, we need to calculate the probability of the genotype
+                else {
+                    double[] normalized = MathUtils.normalizeFromLog10(matrix[index]);
+                    double chosenGenotype = normalized[genotype];
+                    score = -1.0 * Math.log10(1.0 - chosenGenotype);
+                }
+
+                samplesToGenotypes.put(sample, new Pair<Integer, Double>(genotype, Math.abs(score)));
+                index++;
+            }
+
+            samplesToGenotypesPerAF.put(frequency, samplesToGenotypes);
+        }
+    }
 }