diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java old mode 100755 new mode 100644 index 99c0ddfda..3a9f943e2 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java @@ -1,23 +1,37 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.ReadBackedPileup; -import java.util.List; +import java.util.*; + +public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel { -public class AllMAFsGenotypeCalculationModel extends GenotypeCalculationModel { protected AllMAFsGenotypeCalculationModel() {} - public List calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { + protected double[] initializeAlleleFrequencies(int numSamples, int[] baseCounts) { + double[] alleleFrequencies = new double[2 * numSamples + 1]; // 2N + 1 possible minor alleles - // This is just a stub... - // TODO --- implement me - // In this alternative approach we calculate the likelihoods of all possible - // MAFs (1-N) and choose the most likely one. We can probably be smart and - // avoid calculating MAFs that we know can't be the most likely... + return alleleFrequencies; + } + + protected HashMap initializeGenotypeLikelihoods(char ref, HashMap contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType) { + return null; + } + + protected double[] calculateAlleleFrequencyPosteriors(HashMap GLs) { + return null; + } + + protected void applyAlleleFrequencyToGenotypeLikelihoods(HashMap GLs, double[] alleleFrequencies) { + + } + + protected EMOutput computePofF(char ref, HashMap GLs, double[] alleleFrequencies, int numSamplesInContext) { return null; } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java index 05d6de128..2f7948713 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java @@ -4,224 +4,119 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.ReadBackedPileup; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import java.util.*; -public class EMGenotypeCalculationModel extends GenotypeCalculationModel { +public abstract class EMGenotypeCalculationModel extends GenotypeCalculationModel { // We need to set a limit on the EM iterations in case something flukey goes on - private static final int MAX_EM_ITERATIONS = 6; + protected static final int MAX_EM_ITERATIONS = 6; // We consider the EM stable when the MAF doesn't change more than 1/10N - private static final double EM_STABILITY_METRIC = 0.1; + protected static final double EM_STABILITY_METRIC = 0.1; // keep track of some metrics about our calls - private CallMetrics callsMetrics = new CallMetrics(); + protected CallMetrics callsMetrics = new CallMetrics(); protected EMGenotypeCalculationModel() {} public List calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { - // keep track of the GenotypeLikelihoods for each sample, separated by strand - HashMap GLs = new HashMap(); - - // keep track of the total counts of each base in the pileup + // keep track of the context for each sample, overall and separated by strand int[] baseCounts = new int[4]; - int totalObservedBases = 0; - - // First order of business: create the initial GenotypeLikelihoods objects. - // Also, confirm that there aren't too many deletions in the pileup. - int deletionsInPile = 0; - List reads = context.getReads(); - List offsets = context.getOffsets(); - - for (int i = 0; i < reads.size(); i++) { - - // get the read and offset - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - - // skip deletions - if ( offset == -1 ) { - // are there too many deletions in the pileup? - if ( ++deletionsInPile > maxDeletionsInPileup ) - return null; - continue; - } - - // get the base; don't use bad bases - char base = read.getReadString().charAt(offset); - int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); - if ( baseIndex == -1 ) - continue; - - // create the GL holder object if this is the first time we're seeing a base for this sample - String readGroup = read.getAttribute("RG").toString(); // can't be null because those are filtered out - String sample = read.getHeader().getReadGroup(readGroup).getSample(); - UnifiedGenotypeLikelihoods myGLs = GLs.get(sample); - if ( myGLs == null ) { - myGLs = new UnifiedGenotypeLikelihoods(baseModel, new DiploidGenotypePriors(), defaultPlatform, VERBOSE); - GLs.put(sample, myGLs); - } - - // assign the base to the appropriate strand - myGLs.add(base, read, offset); - - // update the base counts - baseCounts[baseIndex]++; - totalObservedBases++; - } - - // for now, we need to special-case single sample mode - if ( samples.size() == 1 ) { - String sample = samples.iterator().next(); - UnifiedGenotypeLikelihoods UGL = GLs.get(sample); - // if there were no good bases, the likelihoods object wouldn't exist - if ( UGL == null ) - return null; - - callsMetrics.nCalledBases++; - UGL.setPriors(priors); - GenotypeCall call = new GenotypeCall(sample,context.getLocation(), ref, UGL.getGenotypeLikelihoods(), new ReadBackedPileup(ref, context)); - - if ( GENOTYPE_MODE || call.isVariant(call.getReference()) ) { - double confidence = (GENOTYPE_MODE ? call.getNegLog10PError() : call.toVariation().getNegLog10PError()); - if ( confidence >= LOD_THRESHOLD ) { - callsMetrics.nConfidentCalls++; - out.addGenotypeCall(call); - } else { - callsMetrics.nNonConfidentCalls++; - } - } - return Arrays.asList(call); - } - - callsMetrics.nCalledBases++; - - // Next, we need to create initial allele frequencies. - // An intelligent guess would be the observed base ratios (plus some small number to account for sampling issues). - - double[] alleleFrequencies = new double[4]; - for (int i = 0; i < 4; i++) - alleleFrequencies[i] = ((double)baseCounts[i] + DiploidGenotypePriors.HUMAN_HETEROZYGOSITY) / (double)totalObservedBases; - DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies); - - for ( UnifiedGenotypeLikelihoods UGL : GLs.values() ) - UGL.setPriors(AFPriors); + HashMap contexts = splitContextBySample(context, baseCounts); + if ( contexts == null ) + return null; // debugging output for (int i = 0; i < 4; i++) - logger.debug("Base count and initial allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + baseCounts[i] + ", " + alleleFrequencies[i]); + logger.debug("Base count for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + baseCounts[i]); - - // The EM loop: - // we want to continue until the calculation is stable, but we need some max on the number of iterations - int iterations = 0; - // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC/N - double EM_STABILITY_VALUE = EM_STABILITY_METRIC / (2.0 * (double)GLs.size()); - boolean isStable = false; + // run the EM calculation + EMOutput overall = runEM(ref, contexts, priors, baseCounts, StratifiedContext.OVERALL); + double lod = overall.getPofD() - overall.getPofNull(); + logger.debug("lod=" + lod); - while ( !isStable && iterations < MAX_EM_ITERATIONS ) { + // calculate strand score + EMOutput forward = runEM(ref, contexts, priors, baseCounts, StratifiedContext.FORWARD); + EMOutput reverse = runEM(ref, contexts, priors, baseCounts, StratifiedContext.REVERSE); + double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull(); + double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull(); + logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); + double strandScore = Math.max(forwardLod - lod, reverseLod - lod); - // calculate the posterior-weighted allele frequencies and modify the priors accordingly - double[] newAlleleFrequencies = getPosteriorWeightedFrequencies(GLs.values()); - AFPriors = calculateAlleleFreqBasedPriors(newAlleleFrequencies); - for ( UnifiedGenotypeLikelihoods UGL : GLs.values() ) - UGL.setPriors(AFPriors); + // TODO -- finish me... - // determine whether we're stable - double AF_delta = 0.0; - for (int i = 0; i < 4; i++) { - AF_delta += Math.abs(alleleFrequencies[i] - newAlleleFrequencies[i]); - logger.debug("Previous allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + alleleFrequencies[i] + ", vs. new frequency: " + newAlleleFrequencies[i]); - } + System.out.println(String.format("LOD=%f, SLOD=%f", lod, strandScore)); - isStable = AF_delta < EM_STABILITY_VALUE; - iterations++; - - alleleFrequencies = newAlleleFrequencies; - } - - logger.debug("EM loop took " + iterations + " iterations"); - - if (true) - throw new RuntimeException("DEBUGGING"); - - // compute actual priors: theta / MAF - double pF = computeThetaOverMAF(ref, alleleFrequencies, GLs.size()); - - // the posteriors from the EM loop are the population likelihoods here - // TODO -- finish me - - -/* - ClassicGenotypeLikelihoods[] G = em_result.genotype_likelihoods; - pD = Compute_pD(G, em_result.sample_weights); - pNull = Compute_pNull(contexts, em_result.sample_weights); - - // Apply p(f). - double pVar = 0.0; - for (int i = 1; i < em_result.EM_N; i++) { pVar += THETA/(double)i; } - - double p0 = Math.log10(1 - pVar); - double pF; - - double MAF = Compute_alt_freq(ref, em_result.allele_likelihoods); - - if (MAF < 1/(2.0*em_result.EM_N)) { pF = p0; } - else { pF = Math.log10(THETA/(2.0*em_result.EM_N * MAF)); } - - pD = pD + pF; - pNull = pNull + p0; - - lod = pD - pNull; - return lod; - -*/ - + // make a call + // List calls //return new GenotypeCall(context.getLocation(), ref,gl, pileup); - //out.addMultiSampleCall((Genotype)calls); + //out.addMultiSampleCall((Genotype)calls, GenotypeMetaData metadata); return null; } - double[] getPosteriorWeightedFrequencies(Collection UGLs) { - double[] newAlleleLikelihoods = new double[4]; + public EMOutput runEM(char ref, HashMap contexts, DiploidGenotypePriors priors, int[] baseCounts, StratifiedContext contextType) { - for ( UnifiedGenotypeLikelihoods UGL : UGLs ) { + // get initial allele frequencies + double[] alleleFrequencies = initializeAlleleFrequencies(contexts.size(), baseCounts); + for (int i = 0; i < alleleFrequencies.length; i++) + logger.debug("Initial allele frequency for i=" + i + ": " + alleleFrequencies[i]); - // calculate the posterior weighted frequencies for this sample - double[] personalAllelePosteriors = new double[4]; - double sum = 0; - for ( DiploidGenotype g : DiploidGenotype.values() ) { - double posterior = Math.pow(10, UGL.getGenotypeLikelihoods().getPosterior(g)); - personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base1)] += posterior; - personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base2)] += posterior; - sum += 2.0 * posterior; - } + // get the initial genotype likelihoods + HashMap GLs = initializeGenotypeLikelihoods(ref, contexts, alleleFrequencies, priors, contextType); - // normalize - for (int i = 0; i < 4; i++) - personalAllelePosteriors[i] /= sum; - for (int i = 0; i < 4; i++) - newAlleleLikelihoods[i] += personalAllelePosteriors[i]; - } + callsMetrics.nCalledBases++; + + // The EM loop: + // we want to continue until the calculation is stable, but we need some max on the number of iterations + int iterations = 0; + boolean EM_IS_STABLE; - // normalize - double sum = 0; - for (int i = 0; i < 4; i++) - sum += newAlleleLikelihoods[i]; - for (int i = 0; i < 4; i++) - newAlleleLikelihoods[i] /= sum; + do { + double[] newAlleleFrequencies = calculateAlleleFrequencyPosteriors(GLs); + for (int i = 0; i < alleleFrequencies.length; i++) + logger.debug("New allele frequency for i=" + i + ": " + newAlleleFrequencies[i]); - return newAlleleLikelihoods; + applyAlleleFrequencyToGenotypeLikelihoods(GLs, newAlleleFrequencies); + + EM_IS_STABLE = isStable(alleleFrequencies, newAlleleFrequencies, contexts.size()); + + alleleFrequencies = newAlleleFrequencies; + + } while ( ++iterations < MAX_EM_ITERATIONS && !EM_IS_STABLE ); + + logger.debug("EM loop took " + iterations + " iterations"); + for ( String sample : GLs.keySet() ) + logger.debug("GenotypeLikelihoods for sample " + sample + ": " + GLs.get(sample).toString()); + + return computePofF(ref, GLs, alleleFrequencies, contexts.size()); } - private DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) { + protected abstract double[] initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts); + protected abstract HashMap initializeGenotypeLikelihoods(char ref, HashMap contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType); + protected abstract double[] calculateAlleleFrequencyPosteriors(HashMap GLs); + protected abstract void applyAlleleFrequencyToGenotypeLikelihoods(HashMap GLs, double[] alleleFrequencies); + protected abstract EMOutput computePofF(char ref, HashMap GLs, double[] alleleFrequencies, int numSamplesInContext); + + + protected boolean isStable(double[] oldAlleleFrequencies, double[] newAlleleFrequencies, int numSamplesInContext) { + // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC/N + double EM_STABILITY_VALUE = EM_STABILITY_METRIC / (2.0 * (double)numSamplesInContext); + + // determine whether we're stable + double AF_delta = 0.0; + for (int i = 0; i < oldAlleleFrequencies.length; i++) + AF_delta += Math.abs(oldAlleleFrequencies[i] - newAlleleFrequencies[i]); + + return (AF_delta < EM_STABILITY_VALUE); + } + + protected DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) { // convert to log-space double[] log10Freqs = new double[4]; for (int i = 0; i < 4; i++) @@ -240,25 +135,66 @@ public class EMGenotypeCalculationModel extends GenotypeCalculationModel { return new DiploidGenotypePriors(alleleFreqPriors); } - private double computeThetaOverMAF(char ref, double[] alleleFrequencies, int samplesAtLocus) { - double MAF; - Integer[] sortedIndexes = Utils.SortPermutation(alleleFrequencies); - if ( sortedIndexes[3] != BaseUtils.simpleBaseToBaseIndex(ref) ) - MAF = alleleFrequencies[sortedIndexes[3]]; - else - MAF = alleleFrequencies[sortedIndexes[2]]; + /** + * Create the mapping from sample to alignment context; also, fill in the base counts along the way. + * Returns null iff there are too many deletions at this position. + */ + protected HashMap splitContextBySample(AlignmentContext context, int[] baseCounts) { - int expectedChrs = (int)(2.0 * (double)samplesAtLocus * MAF); + HashMap contexts = new HashMap(); + for (int i = 0; i < 4; i++) + baseCounts[i] = 0; - // TODO -- we need to use the priors from the UnifiedArgumentCollection - return DiploidGenotypePriors.HUMAN_HETEROZYGOSITY / (double)expectedChrs; + int deletionsInPile = 0; + List reads = context.getReads(); + List offsets = context.getOffsets(); + + for (int i = 0; i < reads.size(); i++) { + + // get the read and offset + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + + // find the sample; special case for pools + String sample; + if ( POOLED_INPUT ) { + sample = "POOL"; + } else { + String readGroup = read.getAttribute("RG").toString(); // can't be null because those are filtered out + sample = read.getHeader().getReadGroup(readGroup).getSample(); + } + + // create a new context object if this is the first time we're seeing a read for this sample + AlignmentContextBySample myContext = contexts.get(sample); + if ( myContext == null ) { + myContext = new AlignmentContextBySample(context.getLocation()); + contexts.put(sample, myContext); + } + + // check for deletions + if ( offset == -1 ) { + // are there too many deletions in the pileup? + if ( ++deletionsInPile > maxDeletionsInPileup ) + return null; + } else { + // add bad bases to the context (for DoC calculations), but don't count them + int baseIndex = BaseUtils.simpleBaseToBaseIndex(read.getReadString().charAt(offset)); + if ( baseIndex != -1 ) + baseCounts[baseIndex]++; + } + + // add the read to this sample's context + myContext.add(read, offset); + } + + return contexts; } /** * A class to keep track of some basic metrics about our calls */ - private class CallMetrics { + protected class CallMetrics { long nConfidentCalls = 0; long nNonConfidentCalls = 0; long nCalledBases = 0; @@ -266,8 +202,96 @@ public class EMGenotypeCalculationModel extends GenotypeCalculationModel { CallMetrics() {} public String toString() { - return String.format("SSG: %d confident and %d non-confident calls were made at %d bases", + return String.format("UG: %d confident and %d non-confident calls were made at %d bases", nConfidentCalls, nNonConfidentCalls, nCalledBases); } } + + + /** + * A class to keep track of the EM output + */ + protected class EMOutput { + private double pD, pNull, pF; + private HashMap GLs; + + EMOutput(double pD, double pNull, double pF, HashMap GLs) { + this.pD = pD; + this.pNull = pNull; + this.pF = pF; + this.GLs = GLs; + } + + public double getPofD() { return pD; } + public double getPofNull() { return pNull; } + public double getPofF() { return pF; } + public HashMap getGenotypeLikelihoods() { return GLs; } + } + + + /** + * A class to keep track of the alignment context observed for a given sample. + * we currently store the overall context and strand-stratified sets, + * but any arbitrary stratification could be added. + */ + protected enum StratifiedContext { OVERALL, FORWARD, REVERSE } + protected class AlignmentContextBySample { + + private AlignmentContext overall = null; + private AlignmentContext forward = null; + private AlignmentContext reverse = null; + private GenomeLoc loc; + + private ArrayList allReads = new ArrayList(); + private ArrayList forwardReads = new ArrayList(); + private ArrayList reverseReads = new ArrayList(); + + private ArrayList allOffsets = new ArrayList(); + private ArrayList forwardOffsets = new ArrayList(); + private ArrayList reverseOffsets = new ArrayList(); + + + AlignmentContextBySample(GenomeLoc loc) { + this.loc = loc; + } + + public AlignmentContext getContext(StratifiedContext context) { + switch ( context ) { + case OVERALL: return getOverallContext(); + case FORWARD: return getForwardContext(); + case REVERSE: return getReverseContext(); + } + return null; + } + + private AlignmentContext getOverallContext() { + if ( overall == null ) + overall = new AlignmentContext(loc, allReads, allOffsets); + return overall; + } + + private AlignmentContext getForwardContext() { + if ( forward == null ) + forward = new AlignmentContext(loc, forwardReads, forwardOffsets); + return forward; + } + + private AlignmentContext getReverseContext() { + if ( reverse == null ) + reverse = new AlignmentContext(loc, reverseReads, reverseOffsets); + return reverse; + } + + public void add(SAMRecord read, int offset) { + if ( read.getReadNegativeStrandFlag() ) { + reverseReads.add(read); + reverseOffsets.add(offset); + } else { + forwardReads.add(read); + forwardOffsets.add(offset); + } + allReads.add(read); + allOffsets.add(offset); + } + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java index 134050157..db62965b7 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java @@ -14,16 +14,18 @@ import java.util.*; public abstract class GenotypeCalculationModel implements Cloneable { public enum Model { - EM, - ALL_MAFS + EM_POINT_ESTIMATE, + EM_ALL_MAFS } protected BaseMismatchModel baseModel; protected Set samples; - protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform; protected GenotypeWriter out; protected Logger logger; + protected double heterozygosity; + protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform; protected boolean GENOTYPE_MODE; + protected boolean POOLED_INPUT; protected double LOD_THRESHOLD; protected int maxDeletionsInPileup; protected boolean VERBOSE; @@ -37,34 +39,26 @@ public abstract class GenotypeCalculationModel implements Cloneable { /** * Initialize the GenotypeCalculationModel object * Assumes that out is not null - * @param baseModel base model to use * @param samples samples in input bam - * @param platform default platform * @param out output writer - * @param out logger - * @param genotypeMode genotyping - * @param lod lod threshold - * @param maxDeletions max deletions to tolerate - * @param verbose verbose flag + * @param logger logger + * @param UAC unified arg collection */ - protected void initialize(BaseMismatchModel baseModel, - Set samples, - EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform, - GenotypeWriter out, - Logger logger, - boolean genotypeMode, - double lod, - int maxDeletions, - boolean verbose) { - this.baseModel = baseModel; + protected void initialize(Set samples, + GenotypeWriter out, + Logger logger, + UnifiedArgumentCollection UAC) { this.samples = samples; - defaultPlatform = platform; this.out = out; this.logger = logger; - GENOTYPE_MODE = genotypeMode; - LOD_THRESHOLD = lod; - maxDeletionsInPileup = maxDeletions; - VERBOSE = verbose; + baseModel = UAC.baseModel; + heterozygosity = UAC.heterozygosity; + defaultPlatform = UAC.defaultPlatform; + GENOTYPE_MODE = UAC.GENOTYPE; + POOLED_INPUT = UAC.POOLED; + LOD_THRESHOLD = UAC.LOD_THRESHOLD; + maxDeletionsInPileup = UAC.MAX_DELETIONS; + VERBOSE = UAC.VERBOSE; } /** @@ -74,12 +68,14 @@ public abstract class GenotypeCalculationModel implements Cloneable { */ protected Object clone() throws CloneNotSupportedException { GenotypeCalculationModel gcm = (GenotypeCalculationModel)super.clone(); - gcm.baseModel = baseModel; gcm.samples = new HashSet(samples); - gcm.defaultPlatform = defaultPlatform; gcm.out = out; gcm.logger = logger; + gcm.baseModel = baseModel; + gcm.heterozygosity = heterozygosity; + gcm.defaultPlatform = defaultPlatform; gcm.GENOTYPE_MODE = GENOTYPE_MODE; + gcm.POOLED_INPUT = POOLED_INPUT; gcm.LOD_THRESHOLD = LOD_THRESHOLD; gcm.maxDeletionsInPileup = maxDeletionsInPileup; gcm.VERBOSE = VERBOSE; @@ -88,6 +84,7 @@ public abstract class GenotypeCalculationModel implements Cloneable { /** * Must be overridden by concrete subclasses + * @param tracker rod data * @param ref reference base * @param context alignment context * @param priors priors to use for GL diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java index 330a02516..100fd1957 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java @@ -52,22 +52,22 @@ public class GenotypeCalculationModelFactory { * @param logger logger * @return model */ - public static GenotypeCalculationModel makeGenotypeCalculation(UnifiedArgumentCollection UAC, - Set samples, + public static GenotypeCalculationModel makeGenotypeCalculation(Set samples, GenotypeWriter out, - Logger logger) { + Logger logger, + UnifiedArgumentCollection UAC) { GenotypeCalculationModel gcm; switch ( UAC.genotypeModel ) { - case EM: - gcm = new EMGenotypeCalculationModel(); + case EM_POINT_ESTIMATE: + gcm = new PointEstimateGenotypeCalculationModel(); break; - case ALL_MAFS: + case EM_ALL_MAFS: gcm = new AllMAFsGenotypeCalculationModel(); break; default: throw new RuntimeException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel); } - gcm.initialize(UAC.baseModel, samples, UAC.defaultPlatform, out, logger, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE); + gcm.initialize(samples, out, logger, UAC); return gcm; } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCall.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCall.java index 88274e596..de7cabc4b 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCall.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCall.java @@ -121,7 +121,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like * * @return get the one minus error value */ - @Override public double getNegLog10PError() { return Math.abs(mGenotypeLikelihoods.getPosterior(getBestGenotype()) - mGenotypeLikelihoods.getPosterior(getNextBest())); } @@ -149,7 +148,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like * * @return the bases, as a string */ - @Override public String getBases() { return getBestGenotype().toString(); } @@ -159,7 +157,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like * * @return the ploidy value */ - @Override public int getPloidy() { return 2; } @@ -169,7 +166,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like * * @return true if we're homozygous, false otherwise */ - @Override public boolean isHom() { return getBestGenotype().isHom(); } @@ -179,7 +175,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like * * @return true if we're het, false otherwise */ - @Override public boolean isHet() { return !isHom(); } @@ -190,7 +185,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like * * @return position on the genome wrapped in GenomeLoc object */ - @Override public GenomeLoc getLocation() { return this.mLocation; } @@ -200,7 +194,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like * * @return true is a SNP */ - @Override public boolean isPointGenotype() { return true; } @@ -212,7 +205,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like * * @return true if we're a variant */ - @Override public boolean isVariant(char ref) { return !Utils.dupString(this.getReference(), 2).equals(getBestGenotype().toString()); } @@ -287,13 +279,11 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like * * @return */ - @Override public double[] getPosteriors() { return this.mGenotypeLikelihoods.getPosteriors(); } /** @return returns the sample name for this genotype */ - @Override public String getSampleName() { return this.mSampleName; } @@ -303,7 +293,6 @@ public class GenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Like * * @return a string, representing the genotyping value */ - @Override public String getFilteringValue() { return "0"; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java new file mode 100755 index 000000000..5195189df --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java @@ -0,0 +1,204 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.DiploidGenotype; + +import java.util.*; + +public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculationModel { + + protected PointEstimateGenotypeCalculationModel() {} + + // overload this method so we can special-case the single sample + public List calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { + + // we don't actually want to run EM for single samples + if ( samples.size() == 1 ) { + + // split the context (so we can get forward/reverse contexts for free) + HashMap contexts = splitContextBySample(context, new int[4]); + if ( contexts == null ) + return null; + + // get the context for the sample + String sample = samples.iterator().next(); + AlignmentContextBySample sampleContext = contexts.get(sample); + + // if there were no good bases, the context wouldn't exist + if ( sampleContext == null ) + return null; + + callsMetrics.nCalledBases++; + + // create the genotype call object + Pair discoveryGL = getSingleSampleLikelihoods(ref, sampleContext, priors, StratifiedContext.OVERALL); + GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, discoveryGL.second, discoveryGL.first); + + if ( GENOTYPE_MODE || call.isVariant(call.getReference()) ) { + double confidence = (GENOTYPE_MODE ? call.getNegLog10PError() : call.toVariation().getNegLog10PError()); + if ( confidence >= LOD_THRESHOLD ) { + callsMetrics.nConfidentCalls++; + out.addGenotypeCall(call); + } else { + callsMetrics.nNonConfidentCalls++; + } + } + return Arrays.asList(call); + } + + return super.calculateGenotype(tracker, ref, context, priors); + } + + private Pair getSingleSampleLikelihoods(char ref, AlignmentContextBySample sampleContext, DiploidGenotypePriors priors, StratifiedContext contextType) { + // create the pileup + AlignmentContext myContext = sampleContext.getContext(contextType); + ReadBackedPileup pileup = new ReadBackedPileup(ref, myContext); + pileup.setIncludeDeletionsInPileupString(true); + + // create the GenotypeLikelihoods object + GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform); + GL.setVerbose(VERBOSE); + GL.add(pileup, true); + return new Pair(pileup, GL); + } + + protected double[] initializeAlleleFrequencies(int numSamplesInContext, int[] baseCounts) { + // An intelligent guess would be the observed base ratios + // (plus some small number to account for sampling issues). + int totalObservedBases = 0; + for (int i = 0; i < 4; i++) + totalObservedBases += baseCounts[i]; + + double[] alleleFrequencies = new double[4]; + for (int i = 0; i < 4; i++) + alleleFrequencies[i] = ((double)baseCounts[i] + DiploidGenotypePriors.HUMAN_HETEROZYGOSITY) / (double)totalObservedBases; + return alleleFrequencies; + } + + protected HashMap initializeGenotypeLikelihoods(char ref, HashMap contexts, double[] alleleFrequencies, DiploidGenotypePriors priors, StratifiedContext contextType) { + HashMap GLs = new HashMap(); + + DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies); + + for ( String sample : contexts.keySet() ) { + AlignmentContextBySample context = contexts.get(sample); + ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType)); + + // create the GenotypeLikelihoods object + GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, AFPriors, defaultPlatform); + GL.setVerbose(VERBOSE); + GL.add(pileup, true); + + GLs.put(sample, GL); + } + + return GLs; + } + + protected double[] calculateAlleleFrequencyPosteriors(HashMap GLs) { + double[] newAlleleLikelihoods = new double[4]; + + for ( GenotypeLikelihoods GL : GLs.values() ) { + double[] normalizedPosteriors = GL.getNormalizedPosteriors(); + + // calculate the posterior weighted frequencies for this sample + double[] personalAllelePosteriors = new double[4]; + for ( DiploidGenotype g : DiploidGenotype.values() ) { + double posterior = normalizedPosteriors[g.ordinal()] / 2.0; // each base gets half the probability + personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base1)] += posterior; + personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base2)] += posterior; + } + + for (int i = 0; i < 4; i++) + newAlleleLikelihoods[i] += personalAllelePosteriors[i]; + } + + // normalize + double sum = 0; + for (int i = 0; i < 4; i++) + sum += newAlleleLikelihoods[i]; + for (int i = 0; i < 4; i++) + newAlleleLikelihoods[i] /= sum; + + return newAlleleLikelihoods; + } + + protected void applyAlleleFrequencyToGenotypeLikelihoods(HashMap GLs, double[] alleleFrequencies) { + DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies); + for ( GenotypeLikelihoods GL : GLs.values() ) + GL.setPriors(AFPriors); + } + + protected EMOutput computePofF(char ref, HashMap GLs, double[] alleleFrequencies, int numSamplesInContext) { + + // compute pD and pNull without allele frequencies + double pD = compute_pD(GLs); + double pNull = compute_pNull(ref, GLs); + logger.debug("Original pD=" + pD + ", pNull=" + pNull); + + // compute p0 + double pVar = 0.0; + for (int i = 1; i < numSamplesInContext; i++) + pVar += heterozygosity/(double)i; + double p0 = Math.log10(1.0 - pVar); + + // compute actual priors: theta / MAF + double MAF; + Integer[] sortedIndexes = Utils.SortPermutation(alleleFrequencies); + if ( sortedIndexes[3] != BaseUtils.simpleBaseToBaseIndex(ref) ) + MAF = alleleFrequencies[sortedIndexes[3]]; + else + MAF = alleleFrequencies[sortedIndexes[2]]; + + // compute pF + double pF; + double expectedChromosomes = 2.0 * (double)numSamplesInContext * MAF; + if ( expectedChromosomes < 1.0 ) + pF = p0; + else + pF = Math.log10(heterozygosity / expectedChromosomes); + logger.debug("p0=" + p0 + ", pF=" + pF); + + pD += pF; + pNull += p0; + logger.debug("Final pD=" + pD + ", pNull=" + pNull); + + return new EMOutput(pD, pNull, pF, GLs); + } + + private double compute_pD(HashMap GLs) { + double pD = 0.0; + for ( GenotypeLikelihoods GL : GLs.values() ) { + double sum = 0.0; + for ( DiploidGenotype g : DiploidGenotype.values() ) { + sum += Math.pow(10, GL.getPosterior(g)); + } + pD += Math.log10(sum); + } + return pD; + } + + private double compute_pNull(char ref, HashMap GLs) { + // compute null likelihoods + double[] alleleLikelihoods = new double[4]; + for (int i = 0; i < 4; i++) + alleleLikelihoods[i] = 1e-6/3.0; + alleleLikelihoods[BaseUtils.simpleBaseToBaseIndex(ref)] = 1.0-1e-6; + DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleLikelihoods); + + HashMap GL_null = new HashMap(); + try { + for ( String sample : GLs.keySet() ) { + GenotypeLikelihoods GL = (GenotypeLikelihoods)GLs.get(sample).clone(); + GL.setPriors(AFPriors); + GL_null.put(sample, GL); + } + } catch (CloneNotSupportedException e) { + throw new StingException("Clone() not supported for given GenotypeLikelihoods subclass?"); + } + + return compute_pD(GL_null); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index c8e2e3ab6..c0fa1830d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -26,29 +26,23 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; -import java.io.File; -/** - * Created by IntelliJ IDEA. - * User: ebanks - * Date: Sep 29, 2009 - * Time: 12:07:43 PM - * To change this template use File | Settings | File Templates. - */ public class UnifiedArgumentCollection { // control the various models to be used - @Argument(fullName = "genotypeModel", shortName = "gm", doc = "Genotype calculation model to employ -- EM is the default choice; the ALL_MAFS model is experimental", required = false) - public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.EM; + @Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- EM_POINT_ESTIMATE is currently the default, while EM_ALL_MAFS is under development", required = false) + public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.EM_POINT_ESTIMATE; - @Argument(fullName = "baseModel", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false) + @Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false) public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL; @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) public Double heterozygosity = DiploidGenotypePriors.HUMAN_HETEROZYGOSITY; + @Argument(fullName = "pooled", shortName = "pooled", doc = "Does the input bam represent pooled data (so that genotypes can't be called)?", required = false) + public boolean POOLED = false; + // control the output @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false) @@ -65,10 +59,10 @@ public class UnifiedArgumentCollection { @Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false) public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null; - @Argument(fullName = "maxDeletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false) + @Argument(fullName = "max_deletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false) public Integer MAX_DELETIONS = 1; - @Argument(fullName = "maxCoverage", shortName = "maxCoverage", doc = "Maximum reads at this locus for it to be callable; to disable, provide value < 1 [default:10,000]", required = false) + @Argument(fullName = "max_coverage", shortName = "coverage", doc = "Maximum reads at this locus for it to be callable; to disable, provide value < 1 [default:10,000]", required = false) public Integer MAX_READS_IN_PILEUP = 10000; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotypeLikelihoods.java deleted file mode 100755 index d06ba7c89..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotypeLikelihoods.java +++ /dev/null @@ -1,58 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.utils.ReadBackedPileup; - -import net.sf.samtools.SAMRecord; - - -/** - A wrapper around the GenotypeLikelihoods class for the UnifiedGenotyper. - This class incorporates both strand-based likelihoods and a combined likelihoods over both strands. -*/ - -public class UnifiedGenotypeLikelihoods { - - private GenotypeLikelihoods forwardStrandGL; - private GenotypeLikelihoods reverseStrandGL; - private GenotypeLikelihoods combinedGL; - - public UnifiedGenotypeLikelihoods(BaseMismatchModel baseModel, DiploidGenotypePriors priors, EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform, boolean VERBOSE) { - forwardStrandGL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform); - forwardStrandGL.setVerbose(VERBOSE); - reverseStrandGL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform); - reverseStrandGL.setVerbose(VERBOSE); - combinedGL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform); - combinedGL.setVerbose(VERBOSE); - } - - public GenotypeLikelihoods getForwardStrandGenotypeLikelihoods() { return forwardStrandGL; } - public GenotypeLikelihoods getReverseStrandGenotypeLikelihoods() { return reverseStrandGL; } - public GenotypeLikelihoods getGenotypeLikelihoods() { return combinedGL; } - - public void add(ReadBackedPileup pileup) { - for (int i = 0; i < pileup.getReads().size(); i++) { - int offset = pileup.getOffsets().get(i); - // ignore deletions - if ( offset == -1 ) - continue; - - SAMRecord read = pileup.getReads().get(i); - char base = read.getReadString().charAt(offset); - add(base, read, offset); - } - } - - public void add(char base, SAMRecord read, int offset) { - if ( !read.getReadNegativeStrandFlag() ) - forwardStrandGL.add(base, read.getBaseQualities()[offset], read, offset); - else - reverseStrandGL.add(base, read.getBaseQualities()[offset], read, offset); - combinedGL.add(base, read.getBaseQualities()[offset], read, offset); - } - - public void setPriors(DiploidGenotypePriors priors) { - forwardStrandGL.setPriors(priors); - reverseStrandGL.setPriors(priors); - combinedGL.setPriors(priors); - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 9e11833e6..7a8ecd5d8 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -55,8 +55,8 @@ public class UnifiedGenotyper extends LocusWalker, Integer> { @Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = false) public File VARIANTS_FILE = null; - @Argument(fullName = "variant_output_format", shortName = "vf", doc = "File format to be used", required = false) - public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI; + @Argument(fullName = "variant_output_format", shortName = "vf", doc = "File format to be used; default is VCF", required = false) + public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF; // the model used for calculating genotypes @@ -79,10 +79,10 @@ public class UnifiedGenotyper extends LocusWalker, Integer> { return (BaseUtils.simpleBaseToBaseIndex(ref) != -1 && context.getReads().size() != 0); } - /** Enable deletions in the pileup */ + /** Enable deletions in the pileup **/ public boolean includeReadsWithDeletionAtLoci() { return true; } - /** Initialize the walker with some sensible defaults */ + /** Initialize the samples, output, and genotype calculation model **/ public void initialize() { // get all of the unique sample names @@ -104,7 +104,7 @@ public class UnifiedGenotyper extends LocusWalker, Integer> { writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out, "UnifiedGenotyper", this.getToolkit().getArguments().referenceFile.getName()); - gcm = GenotypeCalculationModelFactory.makeGenotypeCalculation(UAC, samples, writer, logger); + gcm = GenotypeCalculationModelFactory.makeGenotypeCalculation(samples, writer, logger, UAC); } /**