Fix all-bases-mode and genotype-mode in the UG and add integration tests for them.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2295 348d0f76-0448-11de-a6fe-93d51630548a
2009-12-09 17:41:30 +00:00 · 2009-12-09 17:41:30 +00:00 · 40c2d7a4bc
parent 4e54b91ce4
commit 40c2d7a4bc
5 changed files with 50 additions and 27 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java
@ -44,9 +44,8 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode
        }

        // are we above the lod threshold for emitting calls (and not in all-bases mode)?
-        if ( !ALL_BASE_MODE && (bestIsRef || phredScaledConfidence < CONFIDENCE_THRESHOLD) ) {
-                return new Pair<VariationCall, List<Genotype>>(null, null);
-        }
+        if ( !ALL_BASE_MODE && ((!GENOTYPE_MODE && bestIsRef) || phredScaledConfidence < CONFIDENCE_THRESHOLD) )
+            return new Pair<VariationCall, List<Genotype>>(null, null);

        // generate the calls
        List<Genotype> calls = genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts);
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java
@ -51,22 +51,18 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
        // if there are no non-ref bases...
        if ( bestAlternateAllele == null ) {
            // if we don't want all bases, then we can just return
-            if ( !ALL_BASE_MODE )
+            if ( !ALL_BASE_MODE && !GENOTYPE_MODE )
                return new Pair<VariationCall, List<Genotype>>(null, null);

-            // otherwise, we care about the ref base
-            bestAlternateAllele = ref;
-
-            // TODO -- figure out what to do here!
-
+            // otherwise, choose any alternate allele (it doesn't really matter)
+            bestAlternateAllele = (ref != 'A' ? 'A' : 'C');
        }
-        else {
-            initializeAlleleFrequencies(frequencyEstimationPoints);

-            initialize(ref, contexts, StratifiedContext.OVERALL);
-            calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedContext.OVERALL);
-            calculatePofFs(ref, frequencyEstimationPoints);
-        }
+        initializeAlleleFrequencies(frequencyEstimationPoints);
+
+        initialize(ref, contexts, StratifiedContext.OVERALL);
+        calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedContext.OVERALL);
+        calculatePofFs(ref, frequencyEstimationPoints);

        // print out stats if we have a writer
        if ( verboseWriter != null )
@ -311,19 +307,30 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
    protected List<Genotype> makeGenotypeCalls(char ref, char alt, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc) {
        // by default, we return no genotypes
        return new ArrayList<Genotype>();
-    }
+    }    

    protected Pair<VariationCall, List<Genotype>> createCalls(RefMetaDataTracker tracker, char ref, HashMap<String, AlignmentContextBySample> contexts, GenomeLoc loc, int frequencyEstimationPoints) {
        // only need to look at the most likely alternate allele
        int indexOfMax = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele);

-        double phredScaledConfidence = QualityUtils.phredScaleErrorRate(alleleFrequencyPosteriors[indexOfMax][0]);
-        if ( Double.isInfinite(phredScaledConfidence) )
-            phredScaledConfidence = -10.0 * log10PofDgivenAFi[indexOfMax][0];
        int bestAFguess = Utils.findIndexOfMaxEntry(alleleFrequencyPosteriors[indexOfMax]);
+        double phredScaledConfidence;
+        if ( bestAFguess != 0 ) {
+            phredScaledConfidence = QualityUtils.phredScaleErrorRate(alleleFrequencyPosteriors[indexOfMax][0]);
+            if ( Double.isInfinite(phredScaledConfidence) )
+                phredScaledConfidence = -10.0 * log10PofDgivenAFi[indexOfMax][0];
+        } else {
+            phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofFs[indexOfMax]);
+            if ( Double.isInfinite(phredScaledConfidence) ) {
+                double sum = 0.0;
+                for (int i = 1; i < frequencyEstimationPoints; i++)
+                    sum += log10PofDgivenAFi[indexOfMax][i];
+                phredScaledConfidence = -10.0 * sum;
+            }
+        }

        // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
-        if ( !ALL_BASE_MODE && (bestAFguess == 0 || phredScaledConfidence < CONFIDENCE_THRESHOLD) )
+        if ( !ALL_BASE_MODE && ((!GENOTYPE_MODE && bestAFguess == 0) || phredScaledConfidence < CONFIDENCE_THRESHOLD) )
            return new Pair<VariationCall, List<Genotype>>(null, null);

        // populate the sample-specific data
@ -333,8 +340,10 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc
        // *** note that calculating strand bias involves overwriting data structures, so we do that last
        VariationCall locusdata = GenotypeWriterFactory.createSupportedCall(OUTPUT_FORMAT, ref, loc, bestAFguess == 0 ? VARIANT_TYPE.REFERENCE : VARIANT_TYPE.SNP);
        if ( locusdata != null ) {
-            locusdata.addAlternateAllele(bestAlternateAllele.toString());
-            locusdata.setNonRefAlleleFrequency((double)bestAFguess / (double)(frequencyEstimationPoints-1));
+            if ( bestAFguess != 0 ) {
+                locusdata.addAlternateAllele(bestAlternateAllele.toString());
+                locusdata.setNonRefAlleleFrequency((double)bestAFguess / (double)(frequencyEstimationPoints-1));
+            }
            if ( locusdata instanceof ConfidenceBacked ) {
                ((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
            }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java
@ -67,9 +67,8 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
            }

            // are we above the lod threshold for emitting calls (and not in all-bases mode)?
-            if ( !ALL_BASE_MODE && (bestIsRef || phredScaledConfidence < CONFIDENCE_THRESHOLD) ) {
-                    return new Pair<VariationCall, List<Genotype>>(null, null);
-            }
+            if ( !ALL_BASE_MODE && ((!GENOTYPE_MODE && bestIsRef) || phredScaledConfidence < CONFIDENCE_THRESHOLD) )
+                return new Pair<VariationCall, List<Genotype>>(null, null);

            // we can now create the genotype call object
            GenotypeCall call = GenotypeWriterFactory.createSupportedGenotypeCall(OUTPUT_FORMAT, ref, context.getLocation());
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@ -48,10 +48,10 @@ public class UnifiedArgumentCollection {
    @Argument(fullName = "variant_output_format", shortName = "vf", doc = "Format to be used to represent variants; default is VCF", required = false)
    public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.VCF;

-    @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
+    @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false)
    public boolean GENOTYPE = false;

-    @Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output nonconfident variant or confident ref calls too?", required = false)
+    @Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output all callable bases?", required = false)
    public boolean ALL_BASES = false;

    @Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false)
--- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
+++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
@ -101,6 +101,22 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
        executeTest("testSingleSamplePilot2 - Joint Estimate", spec);
    }

+    @Test
+    public void testGenotypeModeJoint() {
+        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
+                "-T UnifiedGenotyper -genotype -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,001,000 -bm empirical -gm JOINT_ESTIMATE -confidence 70", 1,
+                Arrays.asList("6971e0bfa524c0e006b3c3ccef52520a"));
+        executeTest("testGenotypeMode - Joint Estimate", spec);
+    }
+
+    @Test
+    public void testAllBasesModeJoint() {
+        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
+                "-T UnifiedGenotyper -all_bases -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,001,000 -bm empirical -gm JOINT_ESTIMATE -confidence 70", 1,
+                Arrays.asList("9f54482c1594bdd1e28b4cf2e51f944f"));
+        executeTest("testAllBasesMode - Joint Estimate", spec);
+    }
+
    //@Test
    //public void testGLF() {
    //    WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(