Joint Estimation model now emits calls in all formats.

The whole GenotypeCall framework needs to be changed, but this will work for the time being. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1902 348d0f76-0448-11de-a6fe-93d51630548a
2009-10-23 03:07:28 +00:00 · 2009-10-23 03:07:28 +00:00 · 6c338eccb8
parent a6dc8cd44e
commit 6c338eccb8
3 changed files with 75 additions and 16 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java
@ -30,10 +30,12 @@ public abstract class GenotypeCalculationModel implements Cloneable {
    protected Logger logger;
    protected double heterozygosity;
    protected EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform;
    protected boolean ALL_BASE_MODE;
    protected boolean GENOTYPE_MODE;
    protected boolean POOLED_INPUT;
    protected int POOL_SIZE;
    protected double LOD_THRESHOLD;
    protected double CONFIDENCE_THRESHOLD;
    protected double MINIMUM_ALLELE_FREQUENCY;
    protected int maxDeletionsInPileup;
    protected String assumedSingleSample;
@ -60,10 +62,12 @@ public abstract class GenotypeCalculationModel implements Cloneable {
        baseModel = UAC.baseModel;
        heterozygosity = UAC.heterozygosity;
        defaultPlatform = UAC.defaultPlatform;
        ALL_BASE_MODE = UAC.ALL_BASES;
        GENOTYPE_MODE = UAC.GENOTYPE;
        POOLED_INPUT = UAC.POOLED;
        POOL_SIZE = UAC.POOLSIZE;
        LOD_THRESHOLD = UAC.LOD_THRESHOLD;
        CONFIDENCE_THRESHOLD = UAC.CONFIDENCE_THRESHOLD;
        MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY;
        maxDeletionsInPileup = UAC.MAX_DELETIONS;
        assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE;
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java
@ -28,7 +28,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
    private double[] PofFs = new double[BaseUtils.BASES.length];
    // the minimum and actual number of points in our allele frequency estimation
-    private static final int MIN_ESTIMATION_POINTS = 100; //1000;
+    private static final int MIN_ESTIMATION_POINTS = 100;
    private int frequencyEstimationPoints;
    // the GenotypeLikelihoods map
@ -49,7 +49,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
        // run joint estimation for the full GL contexts
        initializeGenotypeLikelihoods(ref, contexts, StratifiedContext.OVERALL);
        calculateAlleleFrequencyPosteriors(ref, context.getLocation());
-
+        return createCalls(ref, contexts);
        //double lod = overall.getPofD() - overall.getPofNull();
        //logger.debug("lod=" + lod);
@ -67,7 +67,6 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
        // generate the calls
        //GenotypeMetaData metadata = new GenotypeMetaData(lod, strandScore, overall.getMAF());
        //return new Pair<List<GenotypeCall>, GenotypeMetaData>(genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts), metadata);
        return null;
    }
    private void initializeAlleleFrequencies(int numSamples) {
@ -232,7 +231,7 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
    private void normalizeFromLog10(double[] array) {
        // for precision purposes, we need to add (or really subtract, since they're
        // all negative) the largest value; also, we need to convert to normal-space.
-        double maxValue = findMaxEntry(array);
+        double maxValue = findMaxEntry(array).first;
        for (int i = 0; i < array.length; i++)
            array[i] = Math.pow(10, array[i] - maxValue);
@ -244,13 +243,17 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
            array[i] /= sum;
    }
-    private static double findMaxEntry(double[] array) {
+    // returns the maximum value in the array and its index
    private static Pair<Double, Integer> findMaxEntry(double[] array) {
        int index = 0;
        double max = array[0];
-        for (int index = 1; index < array.length; index++) {
+        for (int i = 1; i < array.length; i++) {
-            if ( array[index] > max )
+            if ( array[i] > max ) {
-                max = array[index];
+                max = array[i];
                index = i;
            }
        }
-        return max;
+        return new Pair<Double, Integer>(max, index);
    }
    private void printAlleleFrequencyData(char ref, GenomeLoc loc) {
@ -282,9 +285,55 @@ public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationMo
                char base = Character.toLowerCase(altAllele);
                int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
                verboseWriter.println("P(f>0)_" + base + " = " + Math.log10(PofFs[baseIndex]));
                verboseWriter.println("Qscore_" + base + " = " + (-10.0 * Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
                verboseWriter.println("LOD_" + base + " = " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0])));
            }
        }
        verboseWriter.println();
    }
    private Pair<List<GenotypeCall>, GenotypeMetaData> createCalls(char ref, HashMap<String, AlignmentContextBySample> contexts) {
        // first, find the alt allele with maximum confidence
        int indexOfMax = 0;
        char baseOfMax = ref;
        double maxConfidence = Double.MIN_VALUE;
        for ( char altAllele : BaseUtils.BASES ) {
            if ( altAllele != ref ) {
                int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele);
                if ( PofFs[baseIndex] > maxConfidence ) {
                    indexOfMax = baseIndex;
                    baseOfMax = altAllele;
                    maxConfidence = PofFs[baseIndex];
                }
            }
        }
        double phredScaledConfidence = -10.0 * Math.log10(alleleFrequencyPosteriors[indexOfMax][0]);
        double bestAFguess = (double)findMaxEntry(alleleFrequencyPosteriors[indexOfMax]).second / (double)(frequencyEstimationPoints-1);
        ArrayList<GenotypeCall> calls = new ArrayList<GenotypeCall>();
        // TODO -- generate strand score
        double strandScore = 0.0;
        GenotypeMetaData metadata = new GenotypeMetaData(phredScaledConfidence, strandScore, bestAFguess);
        // generate calls only if we pass the threshold or we want ref calls
        if ( phredScaledConfidence >= CONFIDENCE_THRESHOLD || ALL_BASE_MODE ) {
            for ( String sample : GLs.keySet() ) {
                // get the pileup
                AlignmentContext context = contexts.get(sample).getContext(StratifiedContext.OVERALL);
                ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
                pileup.setIncludeDeletionsInPileupString(true);
                // TODO -- fix GenotypeCall code so that each call doesn't need its own pileup
                // create the call
                GenotypeCall call = new GenotypeCall(sample, context.getLocation(), ref, GLs.get(sample), pileup);
                calls.add(call);
                // TODO -- fix GenotypeCall code so that UG tells it which genotypes to use
                // TODO -- all of the intelligence for making calls should be in UG
            }
        }
        return new Pair<List<GenotypeCall>, GenotypeMetaData>(calls, metadata);
    }
 }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@ -51,22 +51,28 @@ public class UnifiedArgumentCollection {
    @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
    public boolean GENOTYPE = false;
-    @Argument(fullName = "verboseMode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
+    @Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output nonconfident variant or confident ref calls too?", required = false)
    public boolean ALL_BASES = false;
    @Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
    public String VERBOSE = null;
    // control the error modes
-    @Argument(fullName = "assumeSingleSampleReads", shortName = "singleSample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
+    @Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
    public String ASSUME_SINGLE_SAMPLE = null;
    // control the various parameters to be used
    @Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)
    public double LOD_THRESHOLD = Double.MIN_VALUE;
    @Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform.  Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
    public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null;
    // control the various parameters to be used
    @Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold by which variants should be filtered (for EM_POINT_ESTIMATE model)", required = false)
    public double LOD_THRESHOLD = Double.MIN_VALUE;
    @Argument(fullName = "min_confidence_threshold", shortName = "confidence", doc = "The phred-scaled confidence threshold by which variants should be filtered (for JOINT_ESTIMATE model)", required = false)
    public double CONFIDENCE_THRESHOLD = Double.MIN_VALUE;
    @Argument(fullName = "max_deletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false)
    public Integer MAX_DELETIONS = 1;