Joint Estimation model now emits calls in all formats.

The whole GenotypeCall framework needs to be changed, but this will work for the time being. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1902 348d0f76-0448-11de-a6fe-93d51630548a
2009-10-23 03:07:28 +00:00 · 2009-10-23 03:07:28 +00:00 · 6c338eccb8
parent a6dc8cd44e
commit 6c338eccb8
3 changed files with 75 additions and 16 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java
@ -30,10 +30,12 @@ public abstract class GenotypeCalculationModel implements Cloneable {
    protected Logger logger;
    protected double heterozygosity;
    protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform;
+    protected boolean ALL_BASE_MODE;
    protected boolean GENOTYPE_MODE;
    protected boolean POOLED_INPUT;
    protected int POOL_SIZE;
    protected double LOD_THRESHOLD;
+    protected double CONFIDENCE_THRESHOLD;
    protected double MINIMUM_ALLELE_FREQUENCY;
    protected int maxDeletionsInPileup;
    protected String assumedSingleSample;
@ -60,10 +62,12 @@ public abstract class GenotypeCalculationModel implements Cloneable {
        baseModel = UAC.baseModel;
        heterozygosity = UAC.heterozygosity;
        defaultPlatform = UAC.defaultPlatform;
+        ALL_BASE_MODE = UAC.ALL_BASES;
        GENOTYPE_MODE = UAC.GENOTYPE;
        POOLED_INPUT = UAC.POOLED;
        POOL_SIZE = UAC.POOLSIZE;
        LOD_THRESHOLD = UAC.LOD_THRESHOLD;
+        CONFIDENCE_THRESHOLD = UAC.CONFIDENCE_THRESHOLD;
        MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY;
        maxDeletionsInPileup = UAC.MAX_DELETIONS;
        assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE;
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java
@ -28,7 +28,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
    private double[] PofFs = new double[BaseUtils.BASES.length];

    // the minimum and actual number of points in our allele frequency estimation
-    private static final int MIN_ESTIMATION_POINTS = 100; //1000;
+    private static final int MIN_ESTIMATION_POINTS = 100;
    private int frequencyEstimationPoints;

    // the GenotypeLikelihoods map
@ -49,7 +49,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
        // run joint estimation for the full GL contexts
        initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL);
        calculateAlleleFrequencyPosteriors(ref, context.getLocation());
-
+        return createCalls(ref, contexts);

        //double lod = overall.getPofD() - overall.getPofNull();
        //logger.debug("lod=" + lod);
@ -67,7 +67,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
        // generate the calls
        //GenotypeMetaData metadata = new GenotypeMetaData(lod, strandScore, overall.getMAF());
        //return new Pair<List<GenotypeCall>, GenotypeMetaData>(genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts), metadata);
-        return null;
    }

    private void initializeAlleleFrequencies(int numSamples) {
@ -232,7 +231,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
    private void normalizeFromLog10(double[] array) {
        // for precision purposes, we need to add (or really subtract, since they're
        // all negative) the largest value; also, we need to convert to normal-space.
-        double maxValue = findMaxEntry(array);
+        double maxValue = findMaxEntry(array).first;
        for (int i = 0; i < array.length; i++)
            array[i] = Math.pow(10, array[i] - maxValue);

@ -244,13 +243,17 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
            array[i] /= sum;
    }

-    private static double findMaxEntry(double[] array) {
+    // returns the maximum value in the array and its index
+    private static Pair<Double, Integer> findMaxEntry(double[] array) {
+        int index = 0;
        double max = array[0];
-        for (int index = 1; index < array.length; index++) {
-            if ( array[index] > max )
-                max = array[index];
+        for (int i = 1; i < array.length; i++) {
+            if ( array[i] > max ) {
+                max = array[i];
+                index = i;
+            }
        }
-        return max;
+        return new Pair<Double, Integer>(max, index);
    }

    private void printAlleleFrequencyData(char ref, GenomeLoc loc) {
@ -282,9 +285,55 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
                char base = Character.toLowerCase(altAllele);
                int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
                verboseWriter.println("P(f>0)_" + base + " = " + Math.log10(PofFs[baseIndex]));
+                verboseWriter.println("Qscore_" + base + " = " + (-10.0 * Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
                verboseWriter.println("LOD_" + base + " = " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
            }
        }
        verboseWriter.println();
    }
+
+    private Pair<List<GenotypeCall>, GenotypeMetaData> createCalls(char ref, HashMap<String, AlignmentContextBySample> contexts) {
+        // first, find the alt allele with maximum confidence
+        int indexOfMax = 0;
+        char baseOfMax = ref;
+        double maxConfidence = Double.MIN_VALUE;
+        for ( char altAllele : BaseUtils.BASES ) {
+            if ( altAllele != ref ) {
+                int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
+                if ( PofFs[baseIndex] > maxConfidence ) {
+                    indexOfMax = baseIndex;
+                    baseOfMax = altAllele;
+                    maxConfidence = PofFs[baseIndex];
+                }
+            }
+        }
+        double phredScaledConfidence = -10.0 * Math.log10(alleleFrequencyPosteriors[indexOfMax][0]);
+        double bestAFguess = (double)findMaxEntry(alleleFrequencyPosteriors[indexOfMax]).second / (double)(frequencyEstimationPoints-1);
+
+        ArrayList<GenotypeCall> calls = new ArrayList<GenotypeCall>();
+        // TODO -- generate strand score
+        double strandScore = 0.0;
+        GenotypeMetaData metadata = new GenotypeMetaData(phredScaledConfidence, strandScore, bestAFguess);
+
+        // generate calls only if we pass the threshold or we want ref calls
+        if ( phredScaledConfidence >= CONFIDENCE_THRESHOLD || ALL_BASE_MODE ) {
+
+            for ( String sample : GLs.keySet() ) {
+                // get the pileup
+                AlignmentContext context = contexts.get(sample).getContext(StratifiedContext.OVERALL);
+                ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
+                pileup.setIncludeDeletionsInPileupString(true);
+                // TODO -- fix GenotypeCall code so that each call doesn't need its own pileup
+
+                // create the call
+                GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, GLs.get(sample), pileup);
+                calls.add(call);
+
+                // TODO -- fix GenotypeCall code so that UG tells it which genotypes to use
+                // TODO -- all of the intelligence for making calls should be in UG
+            }
+        }
+
+        return new Pair<List<GenotypeCall>, GenotypeMetaData>(calls, metadata);
+    }
 }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@ -51,22 +51,28 @@ public class UnifiedArgumentCollection {
    @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
    public boolean GENOTYPE = false;

-    @Argument(fullName = "verboseMode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
+    @Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output nonconfident variant or confident ref calls too?", required = false)
+    public boolean ALL_BASES = false;
+
+    @Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
    public String VERBOSE = null;


    // control the error modes
-    @Argument(fullName = "assumeSingleSampleReads", shortName = "singleSample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
+    @Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
    public String ASSUME_SINGLE_SAMPLE = null;

-
-    // control the various parameters to be used
-    @Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)
-    public double LOD_THRESHOLD = Double.MIN_VALUE;
-
    @Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform.  Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
    public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null;

+
+    // control the various parameters to be used
+    @Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold by which variants should be filtered (for EM_POINT_ESTIMATE model)", required = false)
+    public double LOD_THRESHOLD = Double.MIN_VALUE;
+
+    @Argument(fullName = "min_confidence_threshold", shortName = "confidence", doc = "The phred-scaled confidence threshold by which variants should be filtered (for JOINT_ESTIMATE model)", required = false)
+    public double CONFIDENCE_THRESHOLD = Double.MIN_VALUE;
+
    @Argument(fullName = "max_deletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false)
    public Integer MAX_DELETIONS = 1;