diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java index d1d25d4a9..b911168e3 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.genotype.GenotypeWriter; +import org.apache.log4j.Logger; import java.util.*; @@ -22,35 +23,45 @@ public abstract class GenotypeCalculationModel implements Cloneable { protected Set samples; protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform; protected GenotypeWriter out; + protected Logger logger; protected boolean GENOTYPE_MODE; protected double LOD_THRESHOLD; protected int maxDeletionsInPileup; protected boolean VERBOSE; /** - * Create a new GenotypeCalculationModel object with given debugging level + * Create a new GenotypeCalculationModel object + */ + protected GenotypeCalculationModel() {} + + + /** + * Initialize the GenotypeCalculationModel object * Assumes that out is not null * @param baseModel base model to use * @param samples samples in input bam * @param platform default platform * @param out output writer + * @param out logger * @param genotypeMode genotyping * @param lod lod threshold * @param maxDeletions max deletions to tolerate * @param verbose verbose flag */ - public GenotypeCalculationModel(BaseMismatchModel baseModel, - Set samples, - EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform, - GenotypeWriter out, - boolean genotypeMode, - double lod, - int maxDeletions, - boolean verbose) { + protected void initialize(BaseMismatchModel baseModel, + Set samples, + EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform, + GenotypeWriter out, + Logger logger, + boolean genotypeMode, + double lod, + int maxDeletions, + boolean verbose) { this.baseModel = baseModel; this.samples = samples; defaultPlatform = platform; this.out = out; + this.logger = logger; GENOTYPE_MODE = genotypeMode; LOD_THRESHOLD = lod; maxDeletionsInPileup = maxDeletions; @@ -68,6 +79,7 @@ public abstract class GenotypeCalculationModel implements Cloneable { gcm.samples = new HashSet(samples); gcm.defaultPlatform = defaultPlatform; gcm.out = out; + gcm.logger = logger; gcm.GENOTYPE_MODE = GENOTYPE_MODE; gcm.LOD_THRESHOLD = LOD_THRESHOLD; gcm.maxDeletionsInPileup = maxDeletionsInPileup; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java index 9a0b52f30..18f14c7f1 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import static org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model.*; import org.broadinstitute.sting.utils.genotype.GenotypeWriter; +import org.apache.log4j.Logger; import java.util.Set; @@ -47,17 +48,29 @@ public class GenotypeCalculationModelFactory { * * @param UAC The unified argument collection * @param samples samples in bam - * @param out output + * @param out output writer + * @param logger logger * @return model */ public static GenotypeCalculationModel makeGenotypeCalculation(UnifiedArgumentCollection UAC, Set samples, - GenotypeWriter out) { + GenotypeWriter out, + Logger logger) { + GenotypeCalculationModel gcm; switch ( UAC.genotypeModel ) { - case SINGLE_SAMPLE: return new SingleSampleGenotypeCalculationModel(UAC.baseModel, samples, UAC.defaultPlatform, out, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE); - //case MULTI_SAMPLE_EM: return new MultiSampleEMGenotypeCalculationModel(UAC.baseModel, samples, UAC.defaultPlatform, out, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE); - case MULTI_SAMPLE_ALL_MAFS: return new MultiSampleAllMAFsGenotypeCalculationModel(UAC.baseModel, samples, UAC.defaultPlatform, out, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE); + case SINGLE_SAMPLE: + gcm = new SingleSampleGenotypeCalculationModel(); + break; + case MULTI_SAMPLE_EM: + gcm = new MultiSampleEMGenotypeCalculationModel(); + break; + case MULTI_SAMPLE_ALL_MAFS: + gcm = new MultiSampleAllMAFsGenotypeCalculationModel(); + break; default: throw new RuntimeException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel); } + + gcm.initialize(UAC.baseModel, samples, UAC.defaultPlatform, out, logger, UAC.GENOTYPE, UAC.LOD_THRESHOLD, UAC.MAX_DELETIONS, UAC.VERBOSE); + return gcm; } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java index 6d765bb1a..676366ee2 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java @@ -192,6 +192,7 @@ public abstract class GenotypeLikelihoods implements Cloneable { */ public void setPriors(DiploidGenotypePriors priors) { this.priors = priors; + posteriors = zeros.clone(); for ( DiploidGenotype g : DiploidGenotype.values() ) { int i = g.ordinal(); posteriors[i] = priors.getPriors()[i] + likelihoods[i]; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiSampleAllMAFsGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiSampleAllMAFsGenotypeCalculationModel.java index e94258462..149f845af 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiSampleAllMAFsGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiSampleAllMAFsGenotypeCalculationModel.java @@ -3,31 +3,13 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.utils.genotype.GenotypeWriter; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.apache.log4j.Logger; import java.util.Set; public class MultiSampleAllMAFsGenotypeCalculationModel extends GenotypeCalculationModel { - public MultiSampleAllMAFsGenotypeCalculationModel(BaseMismatchModel baseModel, - Set samples, - EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform, - GenotypeWriter out, - boolean genotypeMode, - double lod, - int maxDeletions, - boolean verbose) { - super(baseModel, samples, platform, out, genotypeMode, lod, maxDeletions, verbose); - } - - - /** - * Cloning of the object - * @return clone - * @throws CloneNotSupportedException - */ - protected Object clone() throws CloneNotSupportedException { - return super.clone(); - } + protected MultiSampleAllMAFsGenotypeCalculationModel() {} public boolean calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiSampleEMGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiSampleEMGenotypeCalculationModel.java new file mode 100755 index 000000000..1a457ba59 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiSampleEMGenotypeCalculationModel.java @@ -0,0 +1,252 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.GenotypeWriter; +import org.broadinstitute.sting.utils.genotype.DiploidGenotype; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; + +import java.util.*; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMReadGroupRecord; + +public class MultiSampleEMGenotypeCalculationModel extends GenotypeCalculationModel { + + // We need to set a limit on the EM iterations in case something flukey goes on + private static final int MAX_EM_ITERATIONS = 6; + + // We consider the EM stable when the MAF doesn't change more than 1/10N + private static final double EM_STABILITY_METRIC = 0.1; + + + protected MultiSampleEMGenotypeCalculationModel() {} + + public boolean calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { + + // keep track of the GenotpeLikelihoods for each sample, separated by strand + HashMap GLs = new HashMap(); + + // keep track of the total counts of each base in the pileup + int[] baseCounts = new int[4]; + int totalObservedBases = 0; + + // First order of business: create the initial GenotypeLikelihoods objects. + // Also, confirm that there aren't too many deletions in the pileup. + int deletionsInPile = 0; + List reads = context.getReads(); + List offsets = context.getOffsets(); + + for (int i = 0; i < reads.size(); i++) { + + // set up the base data + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + char base = read.getReadString().charAt(offset); + + // don't use bad bases + int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); + if ( baseIndex == -1 ) { + // are there too many deletions in the pileup? + if ( ++deletionsInPile > maxDeletionsInPileup ) + return false; + continue; + } + + // create the GL holder object if this is the first time we're seeing a base for this sample + String readGroup = read.getAttribute("RG").toString(); // can't be null because those are filtered out + String sample = read.getHeader().getReadGroup(readGroup).getSample(); + PerSampleGenotypeLikelihoods myGLs = GLs.get(sample); + if ( myGLs == null ) { + myGLs = new PerSampleGenotypeLikelihoods(); + GLs.put(sample, myGLs); + } + + // assign the base to the appropriate strand + myGLs.addObservedBase(base, read, offset); + + // update the base counts + baseCounts[baseIndex]++; + totalObservedBases++; + } + + // optimization: if all bases are ref, return + if ( baseCounts[BaseUtils.simpleBaseToBaseIndex(ref)] == totalObservedBases ) + return true; + + // TODO -- Do we quit if there isn't X% of total samples represented in pileup? + + // Next, we need to create initial allele frequencies. + // An intelligent guess would be the observed base ratios (plus some small number to account for sampling issues). + + // TODO --- This needs to be broken out into a separate model as we might want to split up the calculation for each minor allele + + double[] alleleFrequencies = new double[4]; + for (int i = 0; i < 4; i++) + alleleFrequencies[i] = ((double)baseCounts[i] + DiploidGenotypePriors.HUMAN_HETEROZYGOSITY) / (double)totalObservedBases; + DiploidGenotypePriors AFPriors = calculateAlleleFreqBasedPriors(alleleFrequencies); + + for ( PerSampleGenotypeLikelihoods SGL : GLs.values() ) + SGL.createGLs(AFPriors); + + // debugging output + for (int i = 0; i < 4; i++) + logger.debug("Base count and initial allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + baseCounts[i] + ", " + alleleFrequencies[i]); + + + // The EM loop: + // we want to continue until the calculation is stable, but we need some max on the number of iterations + int iterations = 0; + // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC/N + double EM_STABILITY_VALUE = EM_STABILITY_METRIC / (2.0 * (double)GLs.size()); + boolean isStable = false; + + while ( !isStable && iterations < MAX_EM_ITERATIONS ) { + + // calculate the posterior-weighted allele frequencies and modify the priors accordingly + double[] newAlleleFrequencies = getPosteriorWeightedFrequencies(GLs.values()); + AFPriors = calculateAlleleFreqBasedPriors(newAlleleFrequencies); + for ( PerSampleGenotypeLikelihoods SGL : GLs.values() ) + SGL.setPriors(AFPriors); + + // determine whether we're stable + double AF_delta = 0.0; + for (int i = 0; i < 4; i++) { + AF_delta += Math.abs(alleleFrequencies[i] - newAlleleFrequencies[i]); + logger.debug("Previous allele frequency for " + BaseUtils.baseIndexToSimpleBase(i) + ": " + alleleFrequencies[i] + ", vs. new frequency: " + newAlleleFrequencies[i]); + } + + isStable = AF_delta < EM_STABILITY_VALUE; + iterations++; + + alleleFrequencies = newAlleleFrequencies; + } + + logger.debug("EM loop took " + iterations + " iterations"); + + if (true) + throw new RuntimeException("DEBUGGING"); + +/* + ClassicGenotypeLikelihoods[] G = em_result.genotype_likelihoods; + pD = Compute_pD(G, em_result.sample_weights); + pNull = Compute_pNull(contexts, em_result.sample_weights); + + // Apply p(f). + double pVar = 0.0; + for (int i = 1; i < em_result.EM_N; i++) { pVar += THETA/(double)i; } + + double p0 = Math.log10(1 - pVar); + double pF; + + double MAF = Compute_alt_freq(ref, em_result.allele_likelihoods); + + if (MAF < 1/(2.0*em_result.EM_N)) { pF = p0; } + else { pF = Math.log10(THETA/(2.0*em_result.EM_N * MAF)); } + + pD = pD + pF; + pNull = pNull + p0; + + lod = pD - pNull; + return lod; + +*/ + + // apply prior of theta / MAF + // It turns out that the appropriate prior is theta, for any sample size from 1 to ~1,000 + //return new SSGenotypeCall(context.getLocation(), ref,gl, pileup); + + return true; + } + + double[] getPosteriorWeightedFrequencies(Collection SGLs) { + double[] newAlleleLikelihoods = new double[4]; + + for ( PerSampleGenotypeLikelihoods SGL : SGLs ) { + + // calculate the posterior weighted frequencies for this sample + double[] personalAllelePosteriors = new double[4]; + double sum = 0; + for ( DiploidGenotype g : DiploidGenotype.values() ) { + double posterior = Math.pow(10, SGL.totalGL.getPosterior(g)); + personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base1)] += posterior; + personalAllelePosteriors[BaseUtils.simpleBaseToBaseIndex(g.base2)] += posterior; + sum += 2.0 * posterior; + } + + // normalize + for (int i = 0; i < 4; i++) + personalAllelePosteriors[i] /= sum; + for (int i = 0; i < 4; i++) + newAlleleLikelihoods[i] += personalAllelePosteriors[i]; + } + + // normalize + double sum = 0; + for (int i = 0; i < 4; i++) + sum += newAlleleLikelihoods[i]; + for (int i = 0; i < 4; i++) + newAlleleLikelihoods[i] /= sum; + + return newAlleleLikelihoods; + } + + private DiploidGenotypePriors calculateAlleleFreqBasedPriors(double[] alleleFrequencies) { + // convert to log-space + double[] log10Freqs = new double[4]; + for (int i = 0; i < 4; i++) + log10Freqs[i] = Math.log10(alleleFrequencies[i]); + + double[] alleleFreqPriors = new double[10]; + + // this is the Hardy-Weinberg based allele frequency (p^2, q^2, 2pq) + for ( DiploidGenotype g : DiploidGenotype.values() ) { + alleleFreqPriors[g.ordinal()] = log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base1)] + log10Freqs[BaseUtils.simpleBaseToBaseIndex(g.base2)]; + // add a factor of 2 for the 2pq case + if ( g.isHet() ) + alleleFreqPriors[g.ordinal()] += Math.log10(2); + } + + return new DiploidGenotypePriors(alleleFreqPriors); + } + + private class PerSampleGenotypeLikelihoods { + + GenotypeLikelihoods forwardStrandGL; + GenotypeLikelihoods reverseStrandGL; + GenotypeLikelihoods totalGL; + + public PerSampleGenotypeLikelihoods() { + forwardStrandGL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, new DiploidGenotypePriors(), defaultPlatform); + forwardStrandGL.setVerbose(VERBOSE); + reverseStrandGL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, new DiploidGenotypePriors(), defaultPlatform); + reverseStrandGL.setVerbose(VERBOSE); + } + + public void addObservedBase(char base, SAMRecord read, int offset) { + if ( !read.getReadNegativeStrandFlag() ) + forwardStrandGL.add(base, read.getBaseQualities()[offset], read, offset); + else + reverseStrandGL.add(base, read.getBaseQualities()[offset], read, offset); + } + + public void createGLs(DiploidGenotypePriors priors) { + setPriors(priors); + + // GL_i = GL+_i + GL-_i + totalGL = GenotypeLikelihoods.combineLikelihoods(forwardStrandGL, reverseStrandGL); + totalGL.setVerbose(VERBOSE); + if ( VERBOSE ) + totalGL.validate(); + } + + public void setPriors(DiploidGenotypePriors priors) { + if ( forwardStrandGL != null ) + forwardStrandGL.setPriors(priors); + if ( reverseStrandGL != null ) + reverseStrandGL.setPriors(priors); + if ( totalGL != null ) + totalGL.setPriors(priors); + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotypeCalculationModel.java index 29b8f4ca5..6027251f8 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotypeCalculationModel.java @@ -4,38 +4,31 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.GenotypeWriter; +import org.apache.log4j.Logger; import java.util.Set; public class SingleSampleGenotypeCalculationModel extends GenotypeCalculationModel { - private CallResult callsMetrics; + private CallResult callsMetrics = new CallResult(); - public SingleSampleGenotypeCalculationModel(BaseMismatchModel baseModel, - Set samples, - EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform, - GenotypeWriter out, - boolean genotypeMode, - double lod, - int maxDeletions, - boolean verbose) { - super(baseModel, samples, platform, out, genotypeMode, lod, maxDeletions, verbose); + protected SingleSampleGenotypeCalculationModel() {} + + public void initialize(BaseMismatchModel baseModel, + Set samples, + EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform platform, + GenotypeWriter out, + Logger logger, + boolean genotypeMode, + double lod, + int maxDeletions, + boolean verbose) { + super.initialize(baseModel, samples, platform, out, logger, genotypeMode, lod, maxDeletions, verbose); // check that this truly is a single-sample bam if ( samples.size() > 1 ) throw new StingException("Single-sample genotype calculation model used on multi-sample bam... aborting"); - - callsMetrics = new CallResult(); - } - - /** - * Cloning of the object - * @return clone - * @throws CloneNotSupportedException - */ - protected Object clone() throws CloneNotSupportedException { - return super.clone(); } public boolean calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 2db39342b..49b50928e 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -95,7 +95,7 @@ public class UnifiedGenotyper extends LocusWalker { else writer = GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out); - gcm = GenotypeCalculationModelFactory.makeGenotypeCalculation(UAC, samples, writer); + gcm = GenotypeCalculationModelFactory.makeGenotypeCalculation(UAC, samples, writer, logger); } /**