-Renamed new calculation model and worked out some significant xhanges with Mark
-Allow walkers calling the UG to pass in their own argument collections git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1896 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
8e3f72ced9
commit
55fa1cfa06
|
|
@ -1,254 +0,0 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.*;
|
|
||||||
import org.broadinstitute.sting.utils.genotype.*;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
public class AllMAFsGenotypeCalculationModel extends GenotypeCalculationModel {
|
|
||||||
|
|
||||||
protected AllMAFsGenotypeCalculationModel() {}
|
|
||||||
|
|
||||||
// because the null allele frequencies are constant for a given N,
|
|
||||||
// we cache the results to avoid having to recompute everything
|
|
||||||
private HashMap<Integer, double[]> nullAlleleFrequencyCache = new HashMap<Integer, double[]>();
|
|
||||||
|
|
||||||
// because the Hardy-Weinberg values for a given frequency are constant,
|
|
||||||
// we cache the results to avoid having to recompute everything
|
|
||||||
private HashMap<Double, double[]> hardyWeinbergValueCache = new HashMap<Double, double[]>();
|
|
||||||
|
|
||||||
// the allele frequency priors
|
|
||||||
private double[] alleleFrequencyPriors;
|
|
||||||
|
|
||||||
// the allele frequency posteriors and P(f>0) mapped by alternate allele
|
|
||||||
private HashMap<Character, double[]> alleleFrequencyPosteriors = new HashMap<Character, double[]>();
|
|
||||||
private HashMap<Character, Double> PofFs = new HashMap<Character, Double>();
|
|
||||||
|
|
||||||
// the minimum and actual number of points in our allele frequency estimation
|
|
||||||
private static final int MIN_ESTIMATION_POINTS = 20; //1000;
|
|
||||||
private int frequencyEstimationPoints;
|
|
||||||
|
|
||||||
// the GenotypeLikelihoods map
|
|
||||||
private HashMap<String, AlleleSpecificGenotypeLikelihoods> GLs = new HashMap<String, AlleleSpecificGenotypeLikelihoods>();
|
|
||||||
|
|
||||||
|
|
||||||
public Pair<List<GenotypeCall>, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
|
|
||||||
|
|
||||||
// keep track of the context for each sample, overall and separated by strand
|
|
||||||
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context);
|
|
||||||
if ( contexts == null )
|
|
||||||
return null;
|
|
||||||
|
|
||||||
calculate(ref, contexts, priors, StratifiedContext.OVERALL);
|
|
||||||
|
|
||||||
|
|
||||||
//double lod = overall.getPofD() - overall.getPofNull();
|
|
||||||
//logger.debug("lod=" + lod);
|
|
||||||
|
|
||||||
// calculate strand score
|
|
||||||
//EMOutput forward = calculate(ref, contexts, priors, StratifiedContext.FORWARD);
|
|
||||||
//EMOutput reverse = calculate(ref, contexts, priors, StratifiedContext.REVERSE);
|
|
||||||
//double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull();
|
|
||||||
//double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull();
|
|
||||||
//logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
|
|
||||||
//double strandScore = Math.max(forwardLod - lod, reverseLod - lod);
|
|
||||||
|
|
||||||
//logger.debug(String.format("LOD=%f, SLOD=%f", lod, strandScore));
|
|
||||||
|
|
||||||
// generate the calls
|
|
||||||
//GenotypeMetaData metadata = new GenotypeMetaData(lod, strandScore, overall.getMAF());
|
|
||||||
//return new Pair<List<GenotypeCall>, GenotypeMetaData>(genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts), metadata);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void calculate(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {
|
|
||||||
|
|
||||||
initializeAlleleFrequencies(contexts.size());
|
|
||||||
|
|
||||||
initializeGenotypeLikelihoods(ref, contexts, priors, contextType);
|
|
||||||
|
|
||||||
calculateAlleleFrequencyPosteriors(ref);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void initializeAlleleFrequencies(int numSamples) {
|
|
||||||
|
|
||||||
// calculate the number of estimation points to use:
|
|
||||||
// it's either MIN_ESTIMATION_POINTS or 2N if that's larger
|
|
||||||
// (add 1 for allele frequency of zero)
|
|
||||||
frequencyEstimationPoints = Math.max(MIN_ESTIMATION_POINTS, 2 * numSamples) + 1;
|
|
||||||
|
|
||||||
// set up the allele frequency priors
|
|
||||||
alleleFrequencyPriors = getNullalleleFrequencyPriors(frequencyEstimationPoints);
|
|
||||||
|
|
||||||
for (int i = 1; i < frequencyEstimationPoints; i++)
|
|
||||||
logger.debug("Null allele frequency for MAF=" + i + ": " + alleleFrequencyPriors[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
private double[] getNullalleleFrequencyPriors(int N) {
|
|
||||||
double[] AFs = nullAlleleFrequencyCache.get(N);
|
|
||||||
|
|
||||||
// if it hasn't been calculated yet, do so now
|
|
||||||
if ( AFs == null ) {
|
|
||||||
|
|
||||||
// calculate sum(1/i)
|
|
||||||
double denominator = 0.0;
|
|
||||||
for (int i = 1; i < N; i++)
|
|
||||||
denominator += 1.0 / (double)i;
|
|
||||||
|
|
||||||
// set up delta
|
|
||||||
double delta = 1.0 / denominator;
|
|
||||||
|
|
||||||
// calculate the null allele frequencies
|
|
||||||
AFs = new double[N];
|
|
||||||
for (int i = 1; i < N; i++)
|
|
||||||
AFs[i] = Math.log10(delta / (double)i);
|
|
||||||
|
|
||||||
nullAlleleFrequencyCache.put(N, AFs);
|
|
||||||
}
|
|
||||||
|
|
||||||
return AFs.clone();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, DiploidGenotypePriors priors, StratifiedContext contextType) {
|
|
||||||
GLs.clear();
|
|
||||||
|
|
||||||
for ( String sample : contexts.keySet() ) {
|
|
||||||
AlignmentContextBySample context = contexts.get(sample);
|
|
||||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType));
|
|
||||||
|
|
||||||
// create the GenotypeLikelihoods object
|
|
||||||
GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
|
|
||||||
GL.setVerbose(VERBOSE);
|
|
||||||
GL.add(pileup, true);
|
|
||||||
|
|
||||||
GLs.put(sample, new AlleleSpecificGenotypeLikelihoods(ref, GL));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void calculateAlleleFrequencyPosteriors(char ref) {
|
|
||||||
|
|
||||||
for ( char altAllele : BaseUtils.BASES ) {
|
|
||||||
if ( altAllele == ref )
|
|
||||||
continue;
|
|
||||||
|
|
||||||
double[] AFPosteriors = new double[frequencyEstimationPoints];
|
|
||||||
|
|
||||||
for ( AlleleSpecificGenotypeLikelihoods GL : GLs.values() ) {
|
|
||||||
double[] PofDgivenG = GL.getNormalizedPosteriors(altAllele);
|
|
||||||
|
|
||||||
// calculate the posterior weighted frequencies for this sample
|
|
||||||
for (int i = 1; i < frequencyEstimationPoints; i++) {
|
|
||||||
double f = (double)i / (double)(frequencyEstimationPoints-1);
|
|
||||||
double[] hardyWeinberg = getHardyWeinbergValues(f);
|
|
||||||
|
|
||||||
double PofDgivenAFi = 0.0;
|
|
||||||
PofDgivenAFi += hardyWeinberg[GenotypeType.REF.ordinal()] * PofDgivenG[GenotypeType.REF.ordinal()];
|
|
||||||
PofDgivenAFi += hardyWeinberg[GenotypeType.HET.ordinal()] * PofDgivenG[GenotypeType.HET.ordinal()];
|
|
||||||
PofDgivenAFi += hardyWeinberg[GenotypeType.HOM.ordinal()] * PofDgivenG[GenotypeType.HOM.ordinal()];
|
|
||||||
|
|
||||||
AFPosteriors[i] += Math.log10(PofDgivenAFi);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 1; i < frequencyEstimationPoints; i++)
|
|
||||||
logger.debug("P(D|AF=" + i + ") for alt allele " + altAllele + ": " + AFPosteriors[i]);
|
|
||||||
|
|
||||||
// 1) multiply by null allele frequency priors to get AF posteriors
|
|
||||||
// 2) for precision purposes, we need to subtract the largest posterior value
|
|
||||||
// 3) convert back to normal space
|
|
||||||
double maxValue = findMaxEntry(AFPosteriors, true);
|
|
||||||
for (int i = 1; i < frequencyEstimationPoints; i++)
|
|
||||||
AFPosteriors[i] = Math.pow(10, alleleFrequencyPriors[i] + AFPosteriors[i] - maxValue);
|
|
||||||
|
|
||||||
// normalize
|
|
||||||
double sum = 0.0;
|
|
||||||
for (int i = 1; i < frequencyEstimationPoints; i++)
|
|
||||||
sum += AFPosteriors[i];
|
|
||||||
for (int i = 1; i < frequencyEstimationPoints; i++)
|
|
||||||
AFPosteriors[i] /= sum;
|
|
||||||
|
|
||||||
for (int i = 1; i < frequencyEstimationPoints; i++)
|
|
||||||
logger.debug("Allele frequency posterior estimate for alt allele " + altAllele + " MAF=" + i + ": " + AFPosteriors[i]);
|
|
||||||
logger.debug("P(f>0) for alt allele " + altAllele + ": " + sum);
|
|
||||||
|
|
||||||
alleleFrequencyPosteriors.put(altAllele, AFPosteriors);
|
|
||||||
PofFs.put(altAllele, sum);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private double[] getHardyWeinbergValues(double f) {
|
|
||||||
double[] values = hardyWeinbergValueCache.get(f);
|
|
||||||
|
|
||||||
// if it hasn't been calculated yet, do so now
|
|
||||||
if ( values == null ) {
|
|
||||||
// p^2, 2pq, q^2
|
|
||||||
values = new double[] { Math.pow(1.0 - f, 2), 2.0 * (1.0 - f) * f, Math.pow(f, 2) };
|
|
||||||
hardyWeinbergValueCache.put(f, values);
|
|
||||||
}
|
|
||||||
|
|
||||||
return values;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static double findMaxEntry(double[] array, boolean skipZeroIndex) {
|
|
||||||
int index = skipZeroIndex ? 1 : 0;
|
|
||||||
double max = array[index++];
|
|
||||||
for (; index < array.length; index++) {
|
|
||||||
if ( array[index] > max )
|
|
||||||
max = array[index];
|
|
||||||
}
|
|
||||||
return max;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private enum GenotypeType { REF, HET, HOM }
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A class for the allele-specific GL posteriors
|
|
||||||
*/
|
|
||||||
private class AlleleSpecificGenotypeLikelihoods {
|
|
||||||
// mapping from alt allele base to posteriors
|
|
||||||
private HashMap<Character, double[]> alleleGLs = new HashMap<Character, double[]>();
|
|
||||||
|
|
||||||
AlleleSpecificGenotypeLikelihoods(char ref, GenotypeLikelihoods GL) {
|
|
||||||
double[] posteriors = GL.getPosteriors();
|
|
||||||
|
|
||||||
// get the ref data
|
|
||||||
DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref);
|
|
||||||
double refPosterior = posteriors[refGenotype.ordinal()];
|
|
||||||
String refStr = String.valueOf(ref);
|
|
||||||
|
|
||||||
for ( char base : BaseUtils.BASES ) {
|
|
||||||
if ( base == ref )
|
|
||||||
continue;
|
|
||||||
|
|
||||||
// get hom var and het data
|
|
||||||
DiploidGenotype hetGenotype = ref < base ? DiploidGenotype.valueOf(refStr + String.valueOf(base)) : DiploidGenotype.valueOf(String.valueOf(base) + refStr);
|
|
||||||
DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(base);
|
|
||||||
double hetPosterior = posteriors[hetGenotype.ordinal()];
|
|
||||||
double homPosterior = posteriors[homGenotype.ordinal()];
|
|
||||||
|
|
||||||
// for precision purposes, we need to add (or really subtract, since everything is negative)
|
|
||||||
// the largest posterior value from all entries so that numbers don't get too small
|
|
||||||
double maxValue = Math.max(Math.max(refPosterior, hetPosterior), homPosterior);
|
|
||||||
double normalizedRefPosterior = Math.pow(10, refPosterior - maxValue);
|
|
||||||
double normalizedHetPosterior = Math.pow(10, hetPosterior - maxValue);
|
|
||||||
double normalizedHomPosterior = Math.pow(10, homPosterior - maxValue);
|
|
||||||
|
|
||||||
// normalize
|
|
||||||
double sum = normalizedRefPosterior + normalizedHetPosterior + normalizedHomPosterior;
|
|
||||||
normalizedRefPosterior /= sum;
|
|
||||||
normalizedHetPosterior /= sum;
|
|
||||||
normalizedHomPosterior /= sum;
|
|
||||||
|
|
||||||
double[] altPosteriors = new double[] { normalizedRefPosterior, normalizedHetPosterior, normalizedHomPosterior };
|
|
||||||
alleleGLs.put(base, altPosteriors);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public double[] getNormalizedPosteriors(char altAllele) {
|
|
||||||
return alleleGLs.get(altAllele);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -21,7 +21,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
||||||
|
|
||||||
public enum Model {
|
public enum Model {
|
||||||
EM_POINT_ESTIMATE,
|
EM_POINT_ESTIMATE,
|
||||||
EM_ALL_MAFS
|
JOINT_ESTIMATE
|
||||||
}
|
}
|
||||||
|
|
||||||
protected BaseMismatchModel baseModel;
|
protected BaseMismatchModel baseModel;
|
||||||
|
|
@ -33,6 +33,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
||||||
protected boolean POOLED_INPUT;
|
protected boolean POOLED_INPUT;
|
||||||
protected int POOL_SIZE;
|
protected int POOL_SIZE;
|
||||||
protected double LOD_THRESHOLD;
|
protected double LOD_THRESHOLD;
|
||||||
|
protected double MINIMUM_ALLELE_FREQUENCY;
|
||||||
protected int maxDeletionsInPileup;
|
protected int maxDeletionsInPileup;
|
||||||
protected String assumedSingleSample;
|
protected String assumedSingleSample;
|
||||||
protected boolean VERBOSE;
|
protected boolean VERBOSE;
|
||||||
|
|
@ -62,6 +63,7 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
||||||
POOLED_INPUT = UAC.POOLED;
|
POOLED_INPUT = UAC.POOLED;
|
||||||
POOL_SIZE = UAC.POOLSIZE;
|
POOL_SIZE = UAC.POOLSIZE;
|
||||||
LOD_THRESHOLD = UAC.LOD_THRESHOLD;
|
LOD_THRESHOLD = UAC.LOD_THRESHOLD;
|
||||||
|
MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY;
|
||||||
maxDeletionsInPileup = UAC.MAX_DELETIONS;
|
maxDeletionsInPileup = UAC.MAX_DELETIONS;
|
||||||
assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE;
|
assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE;
|
||||||
VERBOSE = UAC.VERBOSE;
|
VERBOSE = UAC.VERBOSE;
|
||||||
|
|
@ -82,14 +84,16 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
||||||
gcm.GENOTYPE_MODE = GENOTYPE_MODE;
|
gcm.GENOTYPE_MODE = GENOTYPE_MODE;
|
||||||
gcm.POOLED_INPUT = POOLED_INPUT;
|
gcm.POOLED_INPUT = POOLED_INPUT;
|
||||||
gcm.LOD_THRESHOLD = LOD_THRESHOLD;
|
gcm.LOD_THRESHOLD = LOD_THRESHOLD;
|
||||||
|
gcm.MINIMUM_ALLELE_FREQUENCY = MINIMUM_ALLELE_FREQUENCY;
|
||||||
gcm.maxDeletionsInPileup = maxDeletionsInPileup;
|
gcm.maxDeletionsInPileup = maxDeletionsInPileup;
|
||||||
gcm.assumedSingleSample = assumedSingleSample;
|
gcm.assumedSingleSample = assumedSingleSample;
|
||||||
gcm.VERBOSE = VERBOSE;
|
gcm.VERBOSE = VERBOSE;
|
||||||
return gcm;
|
return gcm;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setAssumedSingleSample(String sample) {
|
public void setUnifiedArgumentCollection(UnifiedArgumentCollection UAC) {
|
||||||
assumedSingleSample = sample;
|
// just re-initialize
|
||||||
|
initialize(this.samples, this.logger, UAC);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -58,8 +58,8 @@ public class GenotypeCalculationModelFactory {
|
||||||
case EM_POINT_ESTIMATE:
|
case EM_POINT_ESTIMATE:
|
||||||
gcm = new PointEstimateGenotypeCalculationModel();
|
gcm = new PointEstimateGenotypeCalculationModel();
|
||||||
break;
|
break;
|
||||||
case EM_ALL_MAFS:
|
case JOINT_ESTIMATE:
|
||||||
gcm = new AllMAFsGenotypeCalculationModel();
|
gcm = new JointEstimateGenotypeCalculationModel();
|
||||||
break;
|
break;
|
||||||
default: throw new RuntimeException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel);
|
default: throw new RuntimeException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,284 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.*;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.*;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationModel {
|
||||||
|
|
||||||
|
protected JointEstimateGenotypeCalculationModel() {}
|
||||||
|
|
||||||
|
// because the null allele frequencies are constant for a given N,
|
||||||
|
// we cache the results to avoid having to recompute everything
|
||||||
|
private HashMap<Integer, double[]> nullAlleleFrequencyCache = new HashMap<Integer, double[]>();
|
||||||
|
|
||||||
|
// because the Hardy-Weinberg values for a given frequency are constant,
|
||||||
|
// we cache the results to avoid having to recompute everything
|
||||||
|
private HashMap<Double, Pair<double[], double[]>> hardyWeinbergValueCache = new HashMap<Double, Pair<double[], double[]>>();
|
||||||
|
|
||||||
|
// the allele frequency priors
|
||||||
|
private double[] log10AlleleFrequencyPriors;
|
||||||
|
|
||||||
|
// the allele frequency posteriors and P(f>0) for each alternate allele
|
||||||
|
private double[][] alleleFrequencyPosteriors = new double[BaseUtils.BASES.length][];
|
||||||
|
private double[][] log10PofDgivenAFi = new double[BaseUtils.BASES.length][];
|
||||||
|
private double[] PofFs = new double[BaseUtils.BASES.length];
|
||||||
|
|
||||||
|
// the minimum and actual number of points in our allele frequency estimation
|
||||||
|
private static final int MIN_ESTIMATION_POINTS = 100; //1000;
|
||||||
|
private int frequencyEstimationPoints;
|
||||||
|
|
||||||
|
// the GenotypeLikelihoods map
|
||||||
|
private HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
|
||||||
|
|
||||||
|
private enum GenotypeType { REF, HET, HOM }
|
||||||
|
|
||||||
|
|
||||||
|
public Pair<List<GenotypeCall>, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
|
||||||
|
|
||||||
|
// keep track of the context for each sample, overall and separated by strand
|
||||||
|
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context);
|
||||||
|
if ( contexts == null )
|
||||||
|
return null;
|
||||||
|
|
||||||
|
calculate(ref, contexts, StratifiedContext.OVERALL);
|
||||||
|
|
||||||
|
|
||||||
|
//double lod = overall.getPofD() - overall.getPofNull();
|
||||||
|
//logger.debug("lod=" + lod);
|
||||||
|
|
||||||
|
// calculate strand score
|
||||||
|
//EMOutput forward = calculate(ref, contexts, priors, StratifiedContext.FORWARD);
|
||||||
|
//EMOutput reverse = calculate(ref, contexts, priors, StratifiedContext.REVERSE);
|
||||||
|
//double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull();
|
||||||
|
//double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull();
|
||||||
|
//logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
|
||||||
|
//double strandScore = Math.max(forwardLod - lod, reverseLod - lod);
|
||||||
|
|
||||||
|
//logger.debug(String.format("LOD=%f, SLOD=%f", lod, strandScore));
|
||||||
|
|
||||||
|
// generate the calls
|
||||||
|
//GenotypeMetaData metadata = new GenotypeMetaData(lod, strandScore, overall.getMAF());
|
||||||
|
//return new Pair<List<GenotypeCall>, GenotypeMetaData>(genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts), metadata);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void calculate(char ref, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
|
||||||
|
|
||||||
|
initializeAlleleFrequencies(contexts.size());
|
||||||
|
|
||||||
|
initializeGenotypeLikelihoods(ref, contexts, contextType);
|
||||||
|
|
||||||
|
calculateAlleleFrequencyPosteriors(ref);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initializeAlleleFrequencies(int numSamples) {
|
||||||
|
|
||||||
|
// calculate the number of estimation points to use:
|
||||||
|
// it's either MIN_ESTIMATION_POINTS or 2N if that's larger
|
||||||
|
// (add 1 for allele frequency of zero)
|
||||||
|
frequencyEstimationPoints = Math.max(MIN_ESTIMATION_POINTS, 2 * numSamples) + 1;
|
||||||
|
|
||||||
|
// set up the allele frequency priors
|
||||||
|
log10AlleleFrequencyPriors = getNullAlleleFrequencyPriors(frequencyEstimationPoints);
|
||||||
|
|
||||||
|
for (int i = 0; i < frequencyEstimationPoints; i++)
|
||||||
|
logger.debug("Null allele frequency for MAF=" + i + "/" + (frequencyEstimationPoints-1) + ": " + log10AlleleFrequencyPriors[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
private double[] getNullAlleleFrequencyPriors(int N) {
|
||||||
|
double[] AFs = nullAlleleFrequencyCache.get(N);
|
||||||
|
|
||||||
|
// if it hasn't been calculated yet, do so now
|
||||||
|
if ( AFs == null ) {
|
||||||
|
|
||||||
|
// calculate sum(1/i)
|
||||||
|
double sigma_1_over_I = 0.0;
|
||||||
|
for (int i = 1; i < N; i++)
|
||||||
|
sigma_1_over_I += 1.0 / (double)i;
|
||||||
|
|
||||||
|
// delta = theta / sum(1/i)
|
||||||
|
double delta = heterozygosity / sigma_1_over_I;
|
||||||
|
|
||||||
|
// calculate the null allele frequencies for 1-N
|
||||||
|
AFs = new double[N];
|
||||||
|
double sum = 0.0;
|
||||||
|
for (int i = 1; i < N; i++) {
|
||||||
|
double value = delta / (double)i;
|
||||||
|
AFs[i] = Math.log10(value);
|
||||||
|
sum += value;
|
||||||
|
}
|
||||||
|
|
||||||
|
// null frequency for AF=0 is (1 - sum(all other frequencies))
|
||||||
|
AFs[0] = Math.log10(1.0 - sum);
|
||||||
|
|
||||||
|
nullAlleleFrequencyCache.put(N, AFs);
|
||||||
|
}
|
||||||
|
|
||||||
|
return AFs;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
|
||||||
|
GLs.clear();
|
||||||
|
|
||||||
|
for ( String sample : contexts.keySet() ) {
|
||||||
|
AlignmentContextBySample context = contexts.get(sample);
|
||||||
|
ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType));
|
||||||
|
|
||||||
|
// create the GenotypeLikelihoods object
|
||||||
|
GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, new DiploidGenotypePriors(), defaultPlatform);
|
||||||
|
GL.setVerbose(VERBOSE);
|
||||||
|
GL.add(pileup, true);
|
||||||
|
|
||||||
|
GLs.put(sample, GL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void calculateAlleleFrequencyPosteriors(char ref) {
|
||||||
|
|
||||||
|
// initialization
|
||||||
|
for ( char altAllele : BaseUtils.BASES ) {
|
||||||
|
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
|
||||||
|
alleleFrequencyPosteriors[baseIndex] = new double[frequencyEstimationPoints];
|
||||||
|
log10PofDgivenAFi[baseIndex] = new double[frequencyEstimationPoints];
|
||||||
|
}
|
||||||
|
DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref);
|
||||||
|
|
||||||
|
// for each minor allele frequency
|
||||||
|
for (int i = 0; i < frequencyEstimationPoints; i++) {
|
||||||
|
double f = (double)i / (double)(frequencyEstimationPoints-1);
|
||||||
|
|
||||||
|
DiploidGenotypePriors hardyWeinberg = getHardyWeinbergValues(f, ref);
|
||||||
|
|
||||||
|
// for each sample
|
||||||
|
for ( GenotypeLikelihoods GL : GLs.values() ) {
|
||||||
|
|
||||||
|
// set the GL prior to be the AF priors and get the corresponding posteriors
|
||||||
|
//GL.setPriors(hardyWeinberg);
|
||||||
|
GL.setPriors(new DiploidGenotypePriors());
|
||||||
|
double[] posteriors = GL.getPosteriors();
|
||||||
|
|
||||||
|
// get the ref data
|
||||||
|
double refPosterior = posteriors[refGenotype.ordinal()];
|
||||||
|
String refStr = String.valueOf(ref);
|
||||||
|
|
||||||
|
// for each alternate allele
|
||||||
|
for ( char altAllele : BaseUtils.BASES ) {
|
||||||
|
if ( altAllele == ref )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
|
||||||
|
|
||||||
|
DiploidGenotype hetGenotype = ref < altAllele ? DiploidGenotype.valueOf(refStr + String.valueOf(altAllele)) : DiploidGenotype.valueOf(String.valueOf(altAllele) + refStr);
|
||||||
|
DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(altAllele);
|
||||||
|
|
||||||
|
double[] allelePosteriors = new double[] { refPosterior, posteriors[hetGenotype.ordinal()], posteriors[homGenotype.ordinal()] };
|
||||||
|
normalizeFromLog10(allelePosteriors);
|
||||||
|
//logger.debug("Normalized posteriors for " + altAllele + ": " + allelePosteriors[0] + " " + allelePosteriors[1] + " " + allelePosteriors[2]);
|
||||||
|
|
||||||
|
// calculate the posterior weighted frequencies
|
||||||
|
double[] HWvalues = hardyWeinbergValueCache.get(f).first;
|
||||||
|
double PofDgivenAFi = 0.0;
|
||||||
|
PofDgivenAFi += HWvalues[GenotypeType.REF.ordinal()] * allelePosteriors[GenotypeType.REF.ordinal()];
|
||||||
|
PofDgivenAFi += HWvalues[GenotypeType.HET.ordinal()] * allelePosteriors[GenotypeType.HET.ordinal()];
|
||||||
|
PofDgivenAFi += HWvalues[GenotypeType.HOM.ordinal()] * allelePosteriors[GenotypeType.HOM.ordinal()];
|
||||||
|
log10PofDgivenAFi[baseIndex][i] += Math.log10(PofDgivenAFi);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// for each alternate allele
|
||||||
|
for ( char altAllele : BaseUtils.BASES ) {
|
||||||
|
if ( altAllele == ref )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
|
||||||
|
|
||||||
|
for (int i = 0; i < frequencyEstimationPoints; i++)
|
||||||
|
logger.debug("P(D|AF=" + i + ") for alt allele " + altAllele + ": " + log10PofDgivenAFi[baseIndex][i]);
|
||||||
|
|
||||||
|
// multiply by null allele frequency priors to get AF posteriors, then normalize
|
||||||
|
for (int i = 0; i < frequencyEstimationPoints; i++)
|
||||||
|
alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i];
|
||||||
|
normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]);
|
||||||
|
|
||||||
|
for (int i = 0; i < frequencyEstimationPoints; i++)
|
||||||
|
logger.debug("Allele frequency posterior estimate for alt allele " + altAllele + " MAF=" + i + "/" + (frequencyEstimationPoints-1) + ": " + alleleFrequencyPosteriors[baseIndex][i]);
|
||||||
|
|
||||||
|
// calculate p(f>0)
|
||||||
|
double sum = 0.0;
|
||||||
|
for (int i = 1; i < frequencyEstimationPoints; i++)
|
||||||
|
sum += alleleFrequencyPosteriors[baseIndex][i];
|
||||||
|
PofFs[baseIndex] = Math.min(sum, 1.0); // deal with precision errors
|
||||||
|
logger.info("P(f>0), LOD for alt allele " + altAllele + ": " + Math.log10(PofFs[baseIndex]) + ", " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private DiploidGenotypePriors getHardyWeinbergValues(double f, char ref) {
|
||||||
|
Pair<double[], double[]> HWvalues = hardyWeinbergValueCache.get(f);
|
||||||
|
|
||||||
|
// if it hasn't been calculated yet, do so now
|
||||||
|
if ( HWvalues == null ) {
|
||||||
|
|
||||||
|
// create Hardy-Weinberg based allele frequencies (p^2, 2pq, q^2) converted to log-space
|
||||||
|
double p = 1.0 - f;
|
||||||
|
double q = f;
|
||||||
|
|
||||||
|
// allele frequencies don't actually equal 0...
|
||||||
|
if ( MathUtils.compareDoubles(q, 0.0) == 0 ) {
|
||||||
|
q = MINIMUM_ALLELE_FREQUENCY;
|
||||||
|
p -= MINIMUM_ALLELE_FREQUENCY;
|
||||||
|
} else if ( MathUtils.compareDoubles(p, 0.0) == 0 ) {
|
||||||
|
p = MINIMUM_ALLELE_FREQUENCY;
|
||||||
|
q -= MINIMUM_ALLELE_FREQUENCY;
|
||||||
|
}
|
||||||
|
|
||||||
|
double[] freqs = new double[] { Math.pow(p, 2), 2.0 * p * q, Math.pow(q, 2) };
|
||||||
|
double[] log10Freqs = new double[] { Math.log10(freqs[0]), Math.log10(freqs[1]), Math.log10(freqs[2]) };
|
||||||
|
HWvalues = new Pair<double[], double[]>(freqs, log10Freqs);
|
||||||
|
hardyWeinbergValueCache.put(f, HWvalues);
|
||||||
|
}
|
||||||
|
|
||||||
|
// create the priors based on the HW frequencies
|
||||||
|
double[] alleleFreqPriors = new double[10];
|
||||||
|
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||||
|
if ( g.isHomRef(ref) )
|
||||||
|
alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.REF.ordinal()];
|
||||||
|
else if ( g.isHomVar(ref) )
|
||||||
|
alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.HOM.ordinal()];
|
||||||
|
else if ( g.isHetRef(ref) )
|
||||||
|
alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.HET.ordinal()];
|
||||||
|
else // g is het non-ref
|
||||||
|
alleleFreqPriors[g.ordinal()] = 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return new DiploidGenotypePriors(alleleFreqPriors);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void normalizeFromLog10(double[] array) {
|
||||||
|
// for precision purposes, we need to add (or really subtract, since they're
|
||||||
|
// all negative) the largest value; also, we need to convert to normal-space.
|
||||||
|
double maxValue = findMaxEntry(array);
|
||||||
|
for (int i = 0; i < array.length; i++)
|
||||||
|
array[i] = Math.pow(10, array[i] - maxValue);
|
||||||
|
|
||||||
|
// normalize
|
||||||
|
double sum = 0.0;
|
||||||
|
for (int i = 0; i < array.length; i++)
|
||||||
|
sum += array[i];
|
||||||
|
for (int i = 0; i < array.length; i++)
|
||||||
|
array[i] /= sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static double findMaxEntry(double[] array) {
|
||||||
|
double max = array[0];
|
||||||
|
for (int index = 1; index < array.length; index++) {
|
||||||
|
if ( array[index] > max )
|
||||||
|
max = array[index];
|
||||||
|
}
|
||||||
|
return max;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -31,7 +31,7 @@ import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
public class UnifiedArgumentCollection {
|
public class UnifiedArgumentCollection {
|
||||||
|
|
||||||
// control the various models to be used
|
// control the various models to be used
|
||||||
@Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- EM_POINT_ESTIMATE is currently the default, while EM_ALL_MAFS is under development. An exception will be thrown if an attempt is made to use EM_POINT_ESTIMATE with a pooled genotype calculation.", required = false)
|
@Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- EM_POINT_ESTIMATE is currently the default, while JOINT_ESTIMATE is under development. An exception will be thrown if an attempt is made to use EM_POINT_ESTIMATE with pooled data.", required = false)
|
||||||
public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.EM_POINT_ESTIMATE;
|
public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.EM_POINT_ESTIMATE;
|
||||||
|
|
||||||
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
|
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
|
||||||
|
|
@ -73,6 +73,8 @@ public class UnifiedArgumentCollection {
|
||||||
@Argument(fullName = "max_coverage", shortName = "coverage", doc = "Maximum reads at this locus for it to be callable; to disable, provide value < 1 [default:10,000]", required = false)
|
@Argument(fullName = "max_coverage", shortName = "coverage", doc = "Maximum reads at this locus for it to be callable; to disable, provide value < 1 [default:10,000]", required = false)
|
||||||
public Integer MAX_READS_IN_PILEUP = 10000;
|
public Integer MAX_READS_IN_PILEUP = 10000;
|
||||||
|
|
||||||
|
@Argument(fullName = "min_allele_frequency", shortName = "minFreq", doc = "The minimum possible allele frequency in a population (advanced)", required = false)
|
||||||
|
public double MINIMUM_ALLELE_FREQUENCY = 1e-8;
|
||||||
|
|
||||||
//@Argument(fullName = "disableCache", doc = "[ADVANCED] If true, we won't use the caching system. This argument is for testing purposes only", required = false)
|
//@Argument(fullName = "disableCache", doc = "[ADVANCED] If true, we won't use the caching system. This argument is for testing purposes only", required = false)
|
||||||
//public boolean disableCache = false;
|
//public boolean disableCache = false;
|
||||||
|
|
|
||||||
|
|
@ -82,13 +82,15 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<GenotypeCall>, Genot
|
||||||
/** Enable deletions in the pileup **/
|
/** Enable deletions in the pileup **/
|
||||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets the single sample to assume when read groups are missing.
|
* Sets the argument collection for the UnifiedGenotyper.
|
||||||
* To be used with walkers that call the UnifiedGenotyper's map function
|
* To be used with walkers that call the UnifiedGenotyper's map function
|
||||||
|
* and consequently can't set these arguments on the command-line
|
||||||
*
|
*
|
||||||
**/
|
**/
|
||||||
public void setAssumedSingleSample(String sample) {
|
public void setUnifiedArgumentCollection(UnifiedArgumentCollection UAC) {
|
||||||
gcm.setAssumedSingleSample(sample);
|
gcm.setUnifiedArgumentCollection(UAC);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue