No longer allow the lod_threshold argument - use confidence instead.

Have UG output qscores in all cases. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1968 348d0f76-0448-11de-a6fe-93d51630548a
2009-11-03 16:18:51 +00:00 · 2009-11-03 16:18:51 +00:00 · 11d950abe0
parent 2fb45dbd73
commit 11d950abe0
6 changed files with 66 additions and 31 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java
@ -26,28 +26,46 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode

        // run the EM calculation
        EMOutput overall = runEM(ref, contexts, priors, StratifiedContext.OVERALL);
-        double lod = overall.getPofD() - overall.getPofNull();
-        logger.debug(String.format("LOD=%f", lod));

-        // return a null call if we don't pass the lod cutoff
-        if ( !ALL_BASE_MODE && lod < LOD_THRESHOLD )
-            return new Pair<List<Genotype>, GenotypeLocusData>(null, null);
+        double PofD = Math.pow(10, overall.getPofD());
+        double PofNull = Math.pow(10, overall.getPofNull());
+        double sum = PofD + PofNull;
+
+        // calculate the phred-scaled confidence score
+        double phredScaledConfidence;
+        boolean bestIsRef = false;
+        if ( PofD > PofNull ) {
+            phredScaledConfidence = -10.0 * Math.log10(PofNull / sum);
+        } else {
+            phredScaledConfidence = -10.0 * Math.log10(PofD / sum);
+            bestIsRef = true;
+        }
+
+        // are we above the lod threshold for emitting calls (and not in all-bases mode)?
+        if ( !ALL_BASE_MODE && (bestIsRef || phredScaledConfidence < CONFIDENCE_THRESHOLD) ) {
+                return new Pair<List<Genotype>, GenotypeLocusData>(null, null);
+        }

        // generate the calls
        GenotypeLocusData locusdata = GenotypeWriterFactory.createSupportedGenotypeLocusData(OUTPUT_FORMAT, ref, context.getLocation());
        if ( locusdata != null ) {
            if ( locusdata instanceof ConfidenceBacked ) {
-                ((ConfidenceBacked)locusdata).setConfidence(lod);
+                ((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
            }
            if ( locusdata instanceof SLODBacked ) {
+
                // calculate strand score
+
+                double lod = overall.getPofD() - overall.getPofNull();
+                logger.debug(String.format("LOD=%f", lod));
+
                EMOutput forward = runEM(ref, contexts, priors, StratifiedContext.FORWARD);
                EMOutput reverse = runEM(ref, contexts, priors, StratifiedContext.REVERSE);
                double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull();
                double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull();
                logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod);
                double strandScore = Math.max(forwardLod - lod, reverseLod - lod);
-                logger.debug(String.format("SLOD=%f", lod, strandScore));
+                logger.debug(String.format("SLOD=%f", strandScore));

                ((SLODBacked)locusdata).setSLOD(strandScore);
            }
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java
@ -35,7 +35,6 @@ public abstract class GenotypeCalculationModel implements Cloneable {
    protected boolean GENOTYPE_MODE;
    protected boolean POOLED_INPUT;
    protected int POOL_SIZE;
-    protected double LOD_THRESHOLD;
    protected double CONFIDENCE_THRESHOLD;
    protected double MINIMUM_ALLELE_FREQUENCY;
    protected int maxDeletionsInPileup;
@ -68,7 +67,6 @@ public abstract class GenotypeCalculationModel implements Cloneable {
        GENOTYPE_MODE = UAC.GENOTYPE;
        POOLED_INPUT = UAC.POOLED;
        POOL_SIZE = UAC.POOLSIZE;
-        LOD_THRESHOLD = UAC.LOD_THRESHOLD;
        CONFIDENCE_THRESHOLD = UAC.CONFIDENCE_THRESHOLD;
        MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY;
        maxDeletionsInPileup = UAC.MAX_DELETIONS;
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java
@ -45,16 +45,27 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
            // get the genotype likelihoods
            Pair<ReadBackedPileup, GenotypeLikelihoods> discoveryGL = getSingleSampleLikelihoods(ref, sampleContext, priors, StratifiedContext.OVERALL);

-            // calculate the lod threshold
-            double[] posteriors = discoveryGL.second.getPosteriors();
+            // find the index of the best genotype
+            double[] posteriors = discoveryGL.second.getNormalizedPosteriors();
            Integer sortedPosteriors[] = Utils.SortPermutation(posteriors);
-            double bestGenotype = posteriors[sortedPosteriors[sortedPosteriors.length - 1]];
-            double nextBestGenotype = posteriors[sortedPosteriors[sortedPosteriors.length - 2]];
-            double refGenotype = posteriors[DiploidGenotype.createHomGenotype(ref).ordinal()];
-            double lodConfidence = (GENOTYPE_MODE ? (bestGenotype - nextBestGenotype) : (bestGenotype - refGenotype));
+            int bestIndex = sortedPosteriors[sortedPosteriors.length - 1];
+
+            // flag to determine if ref is the best call (not necessary in genotype mode)
+            boolean bestIsRef = false;
+
+            // calculate the phred-scaled confidence score
+            double phredScaledConfidence;
+            if ( GENOTYPE_MODE ) {
+                phredScaledConfidence = -10.0 * Math.log10(1.0 - posteriors[bestIndex]);
+            } else {
+                int refIndex = DiploidGenotype.createHomGenotype(ref).ordinal();
+                bestIsRef = (refIndex == bestIndex);
+                double pError = (bestIsRef ? 1.0 - posteriors[refIndex] : posteriors[refIndex]);
+                phredScaledConfidence = -10.0 * Math.log10(pError);
+            }

            // are we above the lod threshold for emitting calls (and not in all-bases mode)?
-            if ( !ALL_BASE_MODE && lodConfidence < LOD_THRESHOLD ) {
+            if ( !ALL_BASE_MODE && (bestIsRef || phredScaledConfidence < CONFIDENCE_THRESHOLD) ) {
                    return new Pair<List<Genotype>, GenotypeLocusData>(null, null);
            }

@ -77,7 +88,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation
            GenotypeLocusData locusdata = GenotypeWriterFactory.createSupportedGenotypeLocusData(OUTPUT_FORMAT, ref, context.getLocation());
            if ( locusdata != null ) {
                if ( locusdata instanceof ConfidenceBacked ) {
-                    ((ConfidenceBacked)locusdata).setConfidence(lodConfidence);
+                    ((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence);
                }
            }

--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@ -71,11 +71,11 @@ public class UnifiedArgumentCollection {


    // control the various parameters to be used
-    @Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold by which variants should be filtered (for EM_POINT_ESTIMATE model)", required = false)
+    @Argument(fullName = "lod_threshold", shortName = "lod", doc = "[DEPRECATED] The lod threshold by which variants should be filtered", required = false)
    public double LOD_THRESHOLD = Double.MIN_VALUE;

-    @Argument(fullName = "min_confidence_threshold", shortName = "confidence", doc = "The phred-scaled confidence threshold by which variants should be filtered (for JOINT_ESTIMATE model)", required = false)
-    public double CONFIDENCE_THRESHOLD = Double.MIN_VALUE;
+    @Argument(fullName = "min_confidence_threshold", shortName = "confidence", doc = "The phred-scaled confidence threshold by which variants should be filtered", required = false)
+    public double CONFIDENCE_THRESHOLD = 0.0;

    @Argument(fullName = "max_deletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false)
    public Integer MAX_DELETIONS = 1;
--- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
@ -100,9 +100,17 @@ public class UnifiedGenotyper extends LocusWalker<Pair<List<Genotype>, GenotypeL
     **/
    public void initialize() {

+        // deal with input errors
        if ( UAC.POOLED && UAC.genotypeModel == GenotypeCalculationModel.Model.EM_POINT_ESTIMATE ) {
            throw new StingException("This was an attempt to use an EM Point Estimate model with pooled genotype calculations. This model does not work with pooled data.");
        }
+        if ( UAC.LOD_THRESHOLD > Double.MIN_VALUE ) {
+            StringBuilder sb = new StringBuilder();
+            sb.append("\n***\tThe --lod_threshold argument is no longer supported; instead, please use --min_confidence_threshold.");
+            sb.append("\n***\tThere is approximately a 10-to-1 mapping from confidence to LOD.");
+            sb.append("\n***\tUse Q" + (10.0 * UAC.LOD_THRESHOLD) + " as an approximate equivalent to your LOD " + UAC.LOD_THRESHOLD + " cutoff");
+            throw new StingException(sb.toString());
+        }

        // get all of the unique sample names
        samples = new HashSet<String>();
--- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
+++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java
@ -13,7 +13,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
    }

    public static String testGeliLod5() {
-        return baseTestString() + " --variant_output_format GELI -lod 5";
+        return baseTestString() + " --variant_output_format GELI -confidence 50";
    }

    private static String OneMb1StateMD5 = "7e3fc1d8427329eb2a3e05a81011749a";
@ -42,16 +42,16 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
    @Test
    public void testMultiSamplePilot1() {
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
-                "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -lod 5", 1,
-                Arrays.asList("dba7f51b0ddb1afebcd8f229444f70e5"));
+                "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 50", 1,
+                Arrays.asList("9985a92060b512b5d27b4074faf8b60b"));
        executeTest("testMultiSamplePilot1", spec);
    }

    @Test
    public void testMultiSamplePilot2() {
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
-                "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -lod 5", 1,
-                Arrays.asList("c2a7dd9bec6819b2d7702564955c993b"));
+                "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 50", 1,
+                Arrays.asList("394f009c9bad34eb584fa10d133d79e0"));
        executeTest("testMultiSamplePilot2", spec);
    }

@ -108,7 +108,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
    public void genotypeTest() {
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                testGeliLod5() + " -L 1:10,000,000-10,100,000 -bm empirical --genotype", 1,
-                Arrays.asList("6a930a8f115495f5319541c21a892d8e"));
+                Arrays.asList("45da29e3b1306d546a7b80c30c979ad4"));
        executeTest("genotypeTest", spec);
    }

@ -167,12 +167,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
    @Test
    public void testLOD() {
        HashMap<Double, String> e = new HashMap<Double, String>();
-        e.put( 10.0, "94c5b48c0c956fcdacbffaa38a80d926" );
-        e.put( 3.0, "f1f53b944b821f05d5dd1ad8fda2f57b" );
+        e.put( 100.0, "94c5b48c0c956fcdacbffaa38a80d926" );
+        e.put( 30.0, "df455abc1b2bc533aa1dc6eb088a835a" );

        for ( Map.Entry<Double, String> entry : e.entrySet() ) {
            WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
-                    baseTestString() + " --variant_output_format GELI -L 1:10,000,000-11,000,000 -bm EMPIRICAL -lod " + entry.getKey(), 1,
+                    baseTestString() + " --variant_output_format GELI -L 1:10,000,000-11,000,000 -bm EMPIRICAL -confidence " + entry.getKey(), 1,
                    Arrays.asList(entry.getValue()));
            executeTest("testLOD", spec);
        }
@ -186,7 +186,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
    @Test
    public void testHeterozyosity() {
        HashMap<Double, String> e = new HashMap<Double, String>();
-        e.put( 0.01, "953fe510a81e357d0b56feac2a90095d" );
+        e.put( 0.01, "b8837be7e8beb3ab2ed7150cdc022c65" );
        e.put( 0.0001, "ef0f2af7d13f166829d86b15fabc2b81" );
        e.put( 1.0 / 1850, "a435c8c966c11f4393a25a9d01c4fc3d" );

@ -204,7 +204,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
    @Test
    public void empirical1MbTestBinaryGeli() {
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
-                baseTestString() + " -L 1:10,000,000-11,000,000 -bm empirical --variant_output_format GELI_BINARY -lod 5", 1,
+                baseTestString() + " -L 1:10,000,000-11,000,000 -bm empirical --variant_output_format GELI_BINARY -confidence 50", 1,
                Arrays.asList("915abf82a04fcd1842f6865501bae67c"));
        executeTest("empirical1MbTestBinaryGeli", spec);
    }