-Refactored JointEstimation code so that pooled calling will work
-Use phred-scale for fisher strand test -Use only 2N allele frequency estimation points git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2059 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
43bd4c8e8f
commit
a048f5cdf1
|
|
@ -31,7 +31,7 @@ public class FisherStrand implements VariantAnnotation {
|
|||
if ( pvalue == null )
|
||||
return null;
|
||||
|
||||
return new Pair<String, String>("FisherStrand", String.format("%.4f", pvalue));
|
||||
return new Pair<String, String>("FisherStrand", String.format("%.1f", -10.0 * Math.log10(pvalue)));
|
||||
}
|
||||
|
||||
public boolean useZeroQualityReads() { return false; }
|
||||
|
|
|
|||
|
|
@ -0,0 +1,88 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalculationModel {
|
||||
|
||||
protected DiploidGenotypeCalculationModel() {}
|
||||
|
||||
// the GenotypeLikelihoods map
|
||||
private HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
|
||||
|
||||
|
||||
protected HashMap<String, AlignmentContextBySample> createContexts(AlignmentContext context) {
|
||||
return splitContextBySample(context);
|
||||
}
|
||||
|
||||
protected void initializeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
|
||||
GLs.clear();
|
||||
|
||||
// use flat priors for GLs
|
||||
DiploidGenotypePriors priors = new DiploidGenotypePriors();
|
||||
|
||||
for ( String sample : contexts.keySet() ) {
|
||||
AlignmentContextBySample context = contexts.get(sample);
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType));
|
||||
|
||||
// create the GenotypeLikelihoods object
|
||||
GenotypeLikelihoods GL = new GenotypeLikelihoods(baseModel, priors, defaultPlatform);
|
||||
GL.add(pileup, true);
|
||||
GLs.put(sample, GL);
|
||||
}
|
||||
}
|
||||
|
||||
protected double computeLog10PofDgivenAFi(DiploidGenotype refGenotype, DiploidGenotype hetGenotype, DiploidGenotype homGenotype, double f) {
|
||||
double PofDgivenAFi = 0.0;
|
||||
|
||||
// for each sample
|
||||
for ( GenotypeLikelihoods GL : GLs.values() ) {
|
||||
|
||||
double[] posteriors = GL.getPosteriors();
|
||||
|
||||
double[] allelePosteriors = new double[] { posteriors[refGenotype.ordinal()], posteriors[hetGenotype.ordinal()], posteriors[homGenotype.ordinal()] };
|
||||
allelePosteriors = MathUtils.normalizeFromLog10(allelePosteriors);
|
||||
|
||||
// calculate the posterior weighted frequencies
|
||||
double[] HWvalues = getHardyWeinbergValues(f);
|
||||
double samplePofDgivenAFi = 0.0;
|
||||
samplePofDgivenAFi += HWvalues[GenotypeType.REF.ordinal()] * allelePosteriors[GenotypeType.REF.ordinal()];
|
||||
samplePofDgivenAFi += HWvalues[GenotypeType.HET.ordinal()] * allelePosteriors[GenotypeType.HET.ordinal()];
|
||||
samplePofDgivenAFi += HWvalues[GenotypeType.HOM.ordinal()] * allelePosteriors[GenotypeType.HOM.ordinal()];
|
||||
PofDgivenAFi += Math.log10(samplePofDgivenAFi);
|
||||
}
|
||||
|
||||
return PofDgivenAFi;
|
||||
}
|
||||
|
||||
protected List<Genotype> makeGenotypeCalls(char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc) {
|
||||
ArrayList<Genotype> calls = new ArrayList<Genotype>();
|
||||
|
||||
for ( String sample : GLs.keySet() ) {
|
||||
|
||||
// create the call
|
||||
Genotype call = GenotypeWriterFactory.createSupportedCall(OUTPUT_FORMAT, ref, loc);
|
||||
|
||||
if ( call instanceof ReadBacked ) {
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, contexts.get(sample).getContext(StratifiedContext.OVERALL));
|
||||
((ReadBacked)call).setPileup(pileup);
|
||||
}
|
||||
if ( call instanceof SampleBacked ) {
|
||||
((SampleBacked)call).setSampleName(sample);
|
||||
}
|
||||
if ( call instanceof LikelihoodsBacked ) {
|
||||
((LikelihoodsBacked)call).setLikelihoods(GLs.get(sample).getLikelihoods());
|
||||
}
|
||||
if ( call instanceof PosteriorsBacked ) {
|
||||
((PosteriorsBacked)call).setPosteriors(GLs.get(sample).getPosteriors());
|
||||
}
|
||||
|
||||
calls.add(call);
|
||||
}
|
||||
|
||||
return calls;
|
||||
}
|
||||
}
|
||||
|
|
@ -23,7 +23,8 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
|||
|
||||
public enum Model {
|
||||
EM_POINT_ESTIMATE,
|
||||
JOINT_ESTIMATE
|
||||
JOINT_ESTIMATE,
|
||||
POOLED
|
||||
}
|
||||
|
||||
protected BaseMismatchModel baseModel;
|
||||
|
|
@ -34,7 +35,6 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
|||
protected GenotypeWriterFactory.GENOTYPE_FORMAT OUTPUT_FORMAT;
|
||||
protected boolean ALL_BASE_MODE;
|
||||
protected boolean GENOTYPE_MODE;
|
||||
protected boolean POOLED_INPUT;
|
||||
protected int POOL_SIZE;
|
||||
protected double CONFIDENCE_THRESHOLD;
|
||||
protected double MINIMUM_ALLELE_FREQUENCY;
|
||||
|
|
@ -67,7 +67,6 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
|||
OUTPUT_FORMAT = UAC.VAR_FORMAT;
|
||||
ALL_BASE_MODE = UAC.ALL_BASES;
|
||||
GENOTYPE_MODE = UAC.GENOTYPE;
|
||||
POOLED_INPUT = UAC.POOLED;
|
||||
POOL_SIZE = UAC.POOLSIZE;
|
||||
CONFIDENCE_THRESHOLD = UAC.CONFIDENCE_THRESHOLD;
|
||||
MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY;
|
||||
|
|
@ -144,19 +143,15 @@ public abstract class GenotypeCalculationModel implements Cloneable {
|
|||
SAMRecord read = reads.get(i);
|
||||
int offset = offsets.get(i);
|
||||
|
||||
// find the sample; special case for pools
|
||||
// find the sample
|
||||
String sample;
|
||||
if ( POOLED_INPUT ) {
|
||||
sample = "POOL";
|
||||
SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
if ( readGroup == null ) {
|
||||
if ( assumedSingleSample == null )
|
||||
throw new StingException("Missing read group for read " + read.getReadName());
|
||||
sample = assumedSingleSample;
|
||||
} else {
|
||||
SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
if ( readGroup == null ) {
|
||||
if ( assumedSingleSample == null )
|
||||
throw new StingException("Missing read group for read " + read.getReadName());
|
||||
sample = assumedSingleSample;
|
||||
} else {
|
||||
sample = readGroup.getSample();
|
||||
}
|
||||
sample = readGroup.getSample();
|
||||
}
|
||||
|
||||
// create a new context object if this is the first time we're seeing a read for this sample
|
||||
|
|
|
|||
|
|
@ -59,7 +59,10 @@ public class GenotypeCalculationModelFactory {
|
|||
gcm = new PointEstimateGenotypeCalculationModel();
|
||||
break;
|
||||
case JOINT_ESTIMATE:
|
||||
gcm = new JointEstimateGenotypeCalculationModel();
|
||||
gcm = new DiploidGenotypeCalculationModel();
|
||||
break;
|
||||
case POOLED:
|
||||
gcm = new PooledCalculationModel();
|
||||
break;
|
||||
default: throw new RuntimeException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
|
||||
import java.util.*;
|
||||
|
||||
public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationModel {
|
||||
public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalculationModel {
|
||||
|
||||
protected JointEstimateGenotypeCalculationModel() {}
|
||||
|
||||
|
|
@ -20,63 +20,53 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
// we cache the results to avoid having to recompute everything
|
||||
private HashMap<Double, double[]> hardyWeinbergValueCache = new HashMap<Double, double[]>();
|
||||
|
||||
protected enum GenotypeType { REF, HET, HOM }
|
||||
|
||||
// the allele frequency priors
|
||||
private double[] log10AlleleFrequencyPriors;
|
||||
protected double[] log10AlleleFrequencyPriors;
|
||||
|
||||
// the allele frequency posteriors and P(f>0) for each alternate allele
|
||||
private double[][] alleleFrequencyPosteriors = new double[BaseUtils.BASES.length][];
|
||||
private double[][] log10PofDgivenAFi = new double[BaseUtils.BASES.length][];
|
||||
private double[] PofFs = new double[BaseUtils.BASES.length];
|
||||
protected double[][] alleleFrequencyPosteriors = new double[BaseUtils.BASES.length][];
|
||||
protected double[][] log10PofDgivenAFi = new double[BaseUtils.BASES.length][];
|
||||
protected double[] PofFs = new double[BaseUtils.BASES.length];
|
||||
|
||||
// the minimum and actual number of points in our allele frequency estimation
|
||||
private static final int MIN_ESTIMATION_POINTS = 100;
|
||||
private int frequencyEstimationPoints;
|
||||
|
||||
// the GenotypeLikelihoods map
|
||||
private HashMap<String, GenotypeLikelihoods> GLs = new HashMap<String, GenotypeLikelihoods>();
|
||||
|
||||
private enum GenotypeType { REF, HET, HOM }
|
||||
|
||||
// these need to be implemented
|
||||
protected abstract HashMap<String, AlignmentContextBySample> createContexts(AlignmentContext context);
|
||||
protected abstract void initializeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType);
|
||||
protected abstract double computeLog10PofDgivenAFi(DiploidGenotype refGenotype, DiploidGenotype hetGenotype, DiploidGenotype homGenotype, double f);
|
||||
protected abstract List<Genotype> makeGenotypeCalls(char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc);
|
||||
|
||||
|
||||
public Pair<List<Genotype>, GenotypeLocusData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) {
|
||||
|
||||
// keep track of the context for each sample, overall and separated by strand
|
||||
HashMap<String, AlignmentContextBySample> contexts = splitContextBySample(context);
|
||||
HashMap<String, AlignmentContextBySample> contexts = createContexts(context);
|
||||
if ( contexts == null )
|
||||
return null;
|
||||
|
||||
// todo -- eric, can you refactor this into a superclass the manages the i over 2n calculation?
|
||||
//
|
||||
// The only thing that's different from pools and msg should be the calls to
|
||||
// initializeGenotypeLikelihoods and calculateAlleleFrequencyPosteriors
|
||||
// There's a lot of functionality here that can be separated into a superclass that requires
|
||||
// a few methods be overridden and it'll magically work for all data types, etc.
|
||||
//
|
||||
// Here are some examples:
|
||||
// getNumberSamples() ->
|
||||
// likelihoodsOfDGivenF() -> for MSG, pools, etc -- see below
|
||||
//
|
||||
int numSamples = contexts.size();
|
||||
int frequencyEstimationPoints = (2 * numSamples) + 1; // (add 1 for allele frequency of zero)
|
||||
|
||||
initializeAlleleFrequencies(contexts.size());
|
||||
initializeAlleleFrequencies(frequencyEstimationPoints);
|
||||
|
||||
// run joint estimation for the full GL contexts
|
||||
initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL);
|
||||
calculateAlleleFrequencyPosteriors(ref, context.getLocation());
|
||||
return createCalls(tracker, ref, contexts, context.getLocation());
|
||||
}
|
||||
initializeLikelihoods(ref, contexts, StratifiedContext.OVERALL);
|
||||
calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints);
|
||||
calculatePofFs(ref, frequencyEstimationPoints);
|
||||
|
||||
private void initializeAlleleFrequencies(int numSamples) {
|
||||
// print out stats if we have a writer
|
||||
if ( verboseWriter != null )
|
||||
printAlleleFrequencyData(ref, context.getLocation(), frequencyEstimationPoints);
|
||||
|
||||
// calculate the number of estimation points to use:
|
||||
// it's either MIN_ESTIMATION_POINTS or 2N if that's larger
|
||||
// (add 1 for allele frequency of zero)
|
||||
frequencyEstimationPoints = Math.max(MIN_ESTIMATION_POINTS, 2 * numSamples) + 1;
|
||||
return createCalls(tracker, ref, contexts, context.getLocation(), frequencyEstimationPoints);
|
||||
}
|
||||
|
||||
private void initializeAlleleFrequencies(int frequencyEstimationPoints) {
|
||||
// set up the allele frequency priors
|
||||
log10AlleleFrequencyPriors = getNullAlleleFrequencyPriors(frequencyEstimationPoints);
|
||||
}
|
||||
|
||||
private double[] getNullAlleleFrequencyPriors(int N) {
|
||||
protected double[] getNullAlleleFrequencyPriors(int N) {
|
||||
double[] AFs = nullAlleleFrequencyCache.get(N);
|
||||
|
||||
// if it hasn't been calculated yet, do so now
|
||||
|
|
@ -108,24 +98,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
return AFs;
|
||||
}
|
||||
|
||||
private void initializeGenotypeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
|
||||
GLs.clear();
|
||||
|
||||
// use flat priors for GLs
|
||||
DiploidGenotypePriors priors = new DiploidGenotypePriors();
|
||||
|
||||
for ( String sample : contexts.keySet() ) {
|
||||
AlignmentContextBySample context = contexts.get(sample);
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType));
|
||||
|
||||
// create the GenotypeLikelihoods object
|
||||
GenotypeLikelihoods GL = new GenotypeLikelihoods(baseModel, priors, defaultPlatform);
|
||||
GL.add(pileup, true);
|
||||
GLs.put(sample, GL);
|
||||
}
|
||||
}
|
||||
|
||||
private void calculateAlleleFrequencyPosteriors(char ref, GenomeLoc verboseLocation) {
|
||||
protected void calculateAlleleFrequencyPosteriors(char ref, int frequencyEstimationPoints) {
|
||||
|
||||
// initialization
|
||||
for ( char altAllele : BaseUtils.BASES ) {
|
||||
|
|
@ -134,53 +107,27 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
log10PofDgivenAFi[baseIndex] = new double[frequencyEstimationPoints];
|
||||
}
|
||||
DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref);
|
||||
String refStr = String.valueOf(ref);
|
||||
|
||||
//
|
||||
// todo -- if you invert this loop
|
||||
// foreach frequnecy
|
||||
// foreach altAllele
|
||||
//
|
||||
// then the pooled vs. msg calculation is just a function call difference
|
||||
// for each alternate allele
|
||||
for ( char altAllele : BaseUtils.BASES ) {
|
||||
if ( altAllele == ref )
|
||||
continue;
|
||||
|
||||
// for each minor allele frequency
|
||||
for (int i = 0; i < frequencyEstimationPoints; i++) {
|
||||
double f = (double)i / (double)(frequencyEstimationPoints-1);
|
||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
|
||||
|
||||
// for each sample
|
||||
for ( GenotypeLikelihoods GL : GLs.values() ) {
|
||||
DiploidGenotype hetGenotype = ref < altAllele ? DiploidGenotype.valueOf(refStr + String.valueOf(altAllele)) : DiploidGenotype.valueOf(String.valueOf(altAllele) + refStr);
|
||||
DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(altAllele);
|
||||
|
||||
double[] posteriors = GL.getPosteriors();
|
||||
|
||||
// get the ref data
|
||||
double refPosterior = posteriors[refGenotype.ordinal()];
|
||||
String refStr = String.valueOf(ref);
|
||||
|
||||
// for each alternate allele
|
||||
for ( char altAllele : BaseUtils.BASES ) {
|
||||
if ( altAllele == ref )
|
||||
continue;
|
||||
|
||||
int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
|
||||
|
||||
DiploidGenotype hetGenotype = ref < altAllele ? DiploidGenotype.valueOf(refStr + String.valueOf(altAllele)) : DiploidGenotype.valueOf(String.valueOf(altAllele) + refStr);
|
||||
DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(altAllele);
|
||||
|
||||
double[] allelePosteriors = new double[] { refPosterior, posteriors[hetGenotype.ordinal()], posteriors[homGenotype.ordinal()] };
|
||||
allelePosteriors = MathUtils.normalizeFromLog10(allelePosteriors);
|
||||
//logger.debug("Normalized posteriors for " + altAllele + ": " + allelePosteriors[0] + " " + allelePosteriors[1] + " " + allelePosteriors[2]);
|
||||
|
||||
// calculate the posterior weighted frequencies
|
||||
double[] HWvalues = getHardyWeinbergValues(f);
|
||||
double PofDgivenAFi = 0.0;
|
||||
PofDgivenAFi += HWvalues[GenotypeType.REF.ordinal()] * allelePosteriors[GenotypeType.REF.ordinal()];
|
||||
PofDgivenAFi += HWvalues[GenotypeType.HET.ordinal()] * allelePosteriors[GenotypeType.HET.ordinal()];
|
||||
PofDgivenAFi += HWvalues[GenotypeType.HOM.ordinal()] * allelePosteriors[GenotypeType.HOM.ordinal()];
|
||||
log10PofDgivenAFi[baseIndex][i] += Math.log10(PofDgivenAFi);
|
||||
}
|
||||
// for each minor allele frequency
|
||||
for (int i = 0; i < frequencyEstimationPoints; i++) {
|
||||
double f = (double)i / (double)(frequencyEstimationPoints-1);
|
||||
log10PofDgivenAFi[baseIndex][i] += computeLog10PofDgivenAFi(refGenotype, hetGenotype, homGenotype, f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// todo -- shouldn't this be in a function for clarity?
|
||||
protected void calculatePofFs(char ref, int frequencyEstimationPoints) {
|
||||
// for each alternate allele
|
||||
for ( char altAllele : BaseUtils.BASES ) {
|
||||
if ( altAllele == ref )
|
||||
|
|
@ -199,13 +146,9 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
sum += alleleFrequencyPosteriors[baseIndex][i];
|
||||
PofFs[baseIndex] = Math.min(sum, 1.0); // deal with precision errors
|
||||
}
|
||||
|
||||
// print out stats if we have a position and a writer
|
||||
if ( verboseLocation != null && verboseWriter != null )
|
||||
printAlleleFrequencyData(ref, verboseLocation);
|
||||
}
|
||||
|
||||
private double[] getHardyWeinbergValues(double f) {
|
||||
protected double[] getHardyWeinbergValues(double f) {
|
||||
double[] HWvalues = hardyWeinbergValueCache.get(f);
|
||||
|
||||
// if it hasn't been calculated yet, do so now
|
||||
|
|
@ -231,7 +174,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
return HWvalues;
|
||||
}
|
||||
|
||||
private void printAlleleFrequencyData(char ref, GenomeLoc loc) {
|
||||
protected void printAlleleFrequencyData(char ref, GenomeLoc loc, int frequencyEstimationPoints) {
|
||||
|
||||
verboseWriter.println("Location=" + loc + ", ref=" + ref);
|
||||
StringBuilder header = new StringBuilder("MAF\tNullAFpriors\t");
|
||||
|
|
@ -267,7 +210,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
verboseWriter.println();
|
||||
}
|
||||
|
||||
private Pair<List<Genotype>, GenotypeLocusData> createCalls(RefMetaDataTracker tracker, char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc) {
|
||||
protected Pair<List<Genotype>, GenotypeLocusData> createCalls(RefMetaDataTracker tracker, char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc, int frequencyEstimationPoints) {
|
||||
// first, find the alt allele with maximum confidence
|
||||
int indexOfMax = 0;
|
||||
char baseOfMax = ref;
|
||||
|
|
@ -290,31 +233,8 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
if ( !ALL_BASE_MODE && (bestAFguess == 0 || phredScaledConfidence < CONFIDENCE_THRESHOLD) )
|
||||
return new Pair<List<Genotype>, GenotypeLocusData>(null, null);
|
||||
|
||||
ArrayList<Genotype> calls = new ArrayList<Genotype>();
|
||||
|
||||
// first, populate the sample-specific data
|
||||
for ( String sample : GLs.keySet() ) {
|
||||
|
||||
// create the call
|
||||
AlignmentContext context = contexts.get(sample).getContext(StratifiedContext.OVERALL);
|
||||
Genotype call = GenotypeWriterFactory.createSupportedCall(OUTPUT_FORMAT, ref, context.getLocation());
|
||||
|
||||
if ( call instanceof ReadBacked ) {
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, contexts.get(sample).getContext(StratifiedContext.OVERALL));
|
||||
((ReadBacked)call).setPileup(pileup);
|
||||
}
|
||||
if ( call instanceof SampleBacked ) {
|
||||
((SampleBacked)call).setSampleName(sample);
|
||||
}
|
||||
if ( call instanceof LikelihoodsBacked ) {
|
||||
((LikelihoodsBacked)call).setLikelihoods(GLs.get(sample).getLikelihoods());
|
||||
}
|
||||
if ( call instanceof PosteriorsBacked ) {
|
||||
((PosteriorsBacked)call).setPosteriors(GLs.get(sample).getPosteriors());
|
||||
}
|
||||
|
||||
calls.add(call);
|
||||
}
|
||||
// populate the sample-specific data
|
||||
List<Genotype> calls = makeGenotypeCalls(ref, contexts, loc);
|
||||
|
||||
// next, the general locus data
|
||||
// note that calculating strand bias involves overwriting data structures, so we do that last
|
||||
|
|
@ -340,14 +260,16 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
|
|||
double lod = overallLog10PofF - overallLog10PofNull;
|
||||
|
||||
// the forward lod
|
||||
initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.FORWARD);
|
||||
calculateAlleleFrequencyPosteriors(ref, null);
|
||||
initializeLikelihoods(ref, contexts, StratifiedContext.FORWARD);
|
||||
calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints);
|
||||
calculatePofFs(ref, frequencyEstimationPoints);
|
||||
double forwardLog10PofNull = Math.log10(alleleFrequencyPosteriors[indexOfMax][0]);
|
||||
double forwardLog10PofF = Math.log10(PofFs[indexOfMax]);
|
||||
|
||||
// the reverse lod
|
||||
initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.REVERSE);
|
||||
calculateAlleleFrequencyPosteriors(ref, null);
|
||||
initializeLikelihoods(ref, contexts, StratifiedContext.REVERSE);
|
||||
calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints);
|
||||
calculatePofFs(ref, frequencyEstimationPoints);
|
||||
double reverseLog10PofNull = Math.log10(alleleFrequencyPosteriors[indexOfMax][0]);
|
||||
double reverseLog10PofF = Math.log10(PofFs[indexOfMax]);
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,61 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
public class PooledCalculationModel extends JointEstimateGenotypeCalculationModel {
|
||||
|
||||
protected PooledCalculationModel() {}
|
||||
|
||||
|
||||
protected HashMap<String, AlignmentContextBySample> createContexts(AlignmentContext context) {
|
||||
// for testing purposes, we may want to throw multi-samples at pooled mode,
|
||||
// so we can't use the standard splitContextBySample() method here
|
||||
|
||||
AlignmentContextBySample pooledContext = new AlignmentContextBySample(context.getLocation());
|
||||
|
||||
int deletionsInPileup = 0;
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
|
||||
for (int i = 0; i < reads.size(); i++) {
|
||||
|
||||
// check for deletions
|
||||
int offset = offsets.get(i);
|
||||
if ( offset == -1 ) {
|
||||
// are there too many deletions in the pileup?
|
||||
if ( ++deletionsInPileup > maxDeletionsInPileup && maxDeletionsInPileup >= 0 )
|
||||
return null;
|
||||
}
|
||||
|
||||
// add the read to this sample's context
|
||||
// note that bad bases are added to the context (for DoC calculations later)
|
||||
pooledContext.add(reads.get(i), offset);
|
||||
}
|
||||
|
||||
HashMap<String, AlignmentContextBySample> contexts = new HashMap<String, AlignmentContextBySample>();
|
||||
contexts.put("POOL", pooledContext);
|
||||
return contexts;
|
||||
}
|
||||
|
||||
protected void initializeLikelihoods(char ref, HashMap<String, AlignmentContextBySample> contexts, StratifiedContext contextType) {
|
||||
return;
|
||||
}
|
||||
|
||||
protected double computeLog10PofDgivenAFi(DiploidGenotype refGenotype, DiploidGenotype hetGenotype, DiploidGenotype homGenotype, double f) {
|
||||
|
||||
// TODO -- IMPLEMENT ME
|
||||
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
protected List<Genotype> makeGenotypeCalls(char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc) {
|
||||
// no genotypes in pooled mode
|
||||
return new ArrayList<Genotype>();
|
||||
}
|
||||
}
|
||||
|
|
@ -32,7 +32,7 @@ import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
|
|||
public class UnifiedArgumentCollection {
|
||||
|
||||
// control the various models to be used
|
||||
@Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- EM_POINT_ESTIMATE is currently the default, while JOINT_ESTIMATE is under development. An exception will be thrown if an attempt is made to use EM_POINT_ESTIMATE with pooled data.", required = false)
|
||||
@Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- EM_POINT_ESTIMATE is currently the default, while JOINT_ESTIMATE and POOLED are under development.", required = false)
|
||||
public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.EM_POINT_ESTIMATE;
|
||||
|
||||
@Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
|
||||
|
|
@ -41,10 +41,7 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
|
||||
public Double heterozygosity = DiploidGenotypePriors.HUMAN_HETEROZYGOSITY;
|
||||
|
||||
@Argument(fullName = "pooled", shortName = "pooled", doc = "Does the input bam represent pooled data (so that genotypes can't be called)?", required = false)
|
||||
public boolean POOLED = false;
|
||||
|
||||
@Argument(fullName = "poolSize", shortName = "ps", doc = "Number of individuals in the pool", required = false)
|
||||
@Argument(fullName = "poolSize", shortName = "ps", doc = "Number of individuals in the pool (for POOLED model only)", required = false)
|
||||
public int POOLSIZE = 0;
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -94,8 +94,8 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<Genotype>, GenotypeL
|
|||
public void initialize() {
|
||||
|
||||
// deal with input errors
|
||||
if ( UAC.POOLED && UAC.genotypeModel == GenotypeCalculationModel.Model.EM_POINT_ESTIMATE ) {
|
||||
throw new StingException("This was an attempt to use an EM Point Estimate model with pooled genotype calculations. This model does not work with pooled data.");
|
||||
if ( UAC.POOLSIZE > 0 && UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED ) {
|
||||
throw new StingException("Attempting to use a model other than POOLED with pooled data. Please set the model to POOLED.");
|
||||
}
|
||||
if ( UAC.LOD_THRESHOLD > Double.MIN_VALUE ) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import java.util.Arrays;
|
|||
|
||||
public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
||||
public static String baseTestString() {
|
||||
return "-T VariantAnnotator -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 20:10,000,000-10,010,000 -vcf %s";
|
||||
return "-T VariantAnnotator -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,010,000 -vcf %s";
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
|||
|
|
@ -47,7 +47,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot1PointEM() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("b992e55996942c893948ea85660478ba"));
|
||||
Arrays.asList("37e112d619c2b6f213554edd10262523"));
|
||||
executeTest("testMultiSamplePilot1 - Point Estimate EM", spec);
|
||||
}
|
||||
|
||||
|
|
@ -68,7 +68,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot1Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("90c5129f298075ee0e18233b3763f25d"));
|
||||
Arrays.asList("e4d32d41544fc12c1553f780be3fd272"));
|
||||
executeTest("testMultiSamplePilot1 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
@ -76,7 +76,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot2Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("033390940ecc0e2dcda5559d6a1802fa"));
|
||||
Arrays.asList("d0bfdedca0944a0b05d53f1a75408a57"));
|
||||
executeTest("testMultiSamplePilot2 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
@ -84,7 +84,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testSingleSamplePilot2Joint() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,067,000-10,083,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1,
|
||||
Arrays.asList("8deb8b1132e7ddf28c7a0d919ce22985"));
|
||||
Arrays.asList("e2e4c147075186245d8eb1f8d3bb8705"));
|
||||
executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue