Checking in everyone's changes to the variant recalibrator. We now calculate the variant quality score as a LOD score between the true and false hypotheses. The allele count prior is changed to be (1 - 0.5^ac). The known-sites prior now breaks out HapMap sites separately.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3952 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-08-05 14:12:19 +00:00
parent 07addf1187
commit a8d37da10b
5 changed files with 60 additions and 28 deletions

View File

@ -67,7 +67,7 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
private String CLUSTER_FILENAME = "optimizer.cluster";
@Argument(fullName="maxGaussians", shortName="mG", doc="The maximum number of Gaussians to try during Bayesian clustering", required=false)
private int MAX_GAUSSIANS = 30;
private int MAX_GAUSSIANS = 4;
@Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of iterations to be performed when clustering. Clustering will normally end when convergence is detected.", required=false)
private int MAX_ITERATIONS = 200;
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
@ -87,9 +87,9 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
@Argument(fullName="forceIndependent", shortName="forceIndependent", doc="Force off-diagonal entries in the covariance matrix to be zero.", required=false)
private boolean FORCE_INDEPENDENT = false;
@Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for clustering.", required=false)
private double STD_THRESHOLD = 6.0;
private double STD_THRESHOLD = 4.5;
@Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for clustering.", required=false)
private double QUAL_THRESHOLD = 300.0;
private double QUAL_THRESHOLD = 1000.0;
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
private double SHRINKAGE = 0.0001;
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algoirthm.", required=false)

View File

@ -330,7 +330,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
if ( this.singletonFPRate == -1 )
return alleleCountFactorArray[alleleCount];
else {
return 1 - Math.pow(singletonFPRate, alleleCount);
return Math.min(0.95, 1.0 - Math.pow(singletonFPRate, alleleCount)); //TODO -- define the vals
}
}
@ -345,7 +345,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
}
for( int ttt = 0; ttt < 20; ttt++ ) {
for( int ttt = 0; ttt < 60; ttt++ ) {
performKMeansIteration( data );
}
}
@ -528,12 +528,12 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
return value;
}
public final double evaluateVariantLog10( final VariantContext vc ) {
public final double evaluateVariant( final VariantContext vc ) {
final double[] pVarInCluster = new double[maxGaussians];
final double[] annotations = new double[dataManager.numAnnotations];
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
final double value = decodeAnnotation( dataManager.annotationKeys.get(jjj), vc, true );
final double value = decodeAnnotation( dataManager.annotationKeys.get(jjj), vc, false );
annotations[jjj] = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj];
}
@ -544,12 +544,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
sum += pVarInCluster[kkk]; // * clusterTruePositiveRate[kkk];
}
double sumLog10 = Math.log10(sum);
if( Double.isInfinite(sumLog10) || Double.isNaN(sumLog10) ) {
//logger.warn("pTrueLog10 = -Infinity, capped at -20");
sumLog10 = -20;
}
return sumLog10;
return sum;
}
public final void outputClusterReports( final String outputPrefix ) {
@ -839,22 +834,30 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
final int numAnnotations = annotations.length;
final double mult[] = new double[numAnnotations];
final double evalGaussianPDFLog10[] = new double[maxGaussians];
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
final double sigmaVals[][] = sigmaInverse[kkk].getArray();
double sum = 0.0;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
mult[jjj] = 0.0;
double value = 0.0;
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
mult[jjj] += (annotations[ppp] - mu[kkk][ppp]) * sigmaVals[ppp][jjj];
final double myMu = mu[kkk][ppp];
final double myAnn = annotations[ppp];
final double mySigma = sigmaVals[ppp][jjj];
value += (myAnn - myMu) * mySigma;
}
mult[jjj] = value;
}
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
sum += mult[jjj] * (annotations[jjj] - mu[kkk][jjj]);
}
final double denomLog10 = Math.log10(Math.pow(2.0 * Math.PI, ((double)numAnnotations) / 2.0)) + Math.log10(Math.pow(determinant[kkk], 0.5));
pVarInCluster[kkk] = Math.pow(10.0, pClusterLog10[kkk] + (( -0.5 * sum ) / Math.log(10.0)) - denomLog10);
evalGaussianPDFLog10[kkk] = (( -0.5 * sum ) / Math.log(10.0)) - denomLog10;
pVarInCluster[kkk] = Math.pow(10.0, pClusterLog10[kkk] + evalGaussianPDFLog10[kkk]);
}
}

View File

@ -69,12 +69,12 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private boolean IGNORE_ALL_INPUT_FILTERS = false;
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
private String[] IGNORE_INPUT_FILTERS = null;
@Argument(fullName="hapmap_prior", shortName="hapmapPrior", doc="A prior on the quality of known variants, a phred scaled probability of being true.", required=false)
private double HAPMAP_QUAL_PRIOR = 15.0;
@Argument(fullName="known_prior", shortName="knownPrior", doc="A prior on the quality of known variants, a phred scaled probability of being true.", required=false)
private int KNOWN_QUAL_PRIOR = 9;
private double KNOWN_QUAL_PRIOR = 10.0;
@Argument(fullName="novel_prior", shortName="novelPrior", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
private int NOVEL_QUAL_PRIOR = 2;
@Argument(fullName="quality_scale_factor", shortName="qScale", doc="Multiply all final quality scores by this value. Needed to normalize the quality scores.", required=false)
private double QUALITY_SCALE_FACTOR = 1600.0;
private double NOVEL_QUAL_PRIOR = 2.0;
@Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
private String OUTPUT_PREFIX = "optimizer";
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
@ -90,7 +90,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Argument(fullName="quality_step", shortName="qStep", doc="Resolution in QUAL units for optimization and tranche calculations", required=false)
private double QUAL_STEP = 0.1;
@Argument(fullName="singleton_fp_rate", shortName="fp_rate", doc="Prior expectation that a singleton call would be a FP", required=false)
private double SINGLETON_FP_RATE = -1;
private double SINGLETON_FP_RATE = 0.5;
/////////////////////////////
// Private Member Variables
@ -98,6 +98,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private VariantGaussianMixtureModel theModel = null;
private VCFWriter vcfWriter;
private Set<String> ignoreInputFilterSet = null;
private int numUnstable = 0;
//---------------------------------------------------------------------------------------------------------------
//
@ -185,11 +186,27 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
variantDatum.isKnown = dbsnp != null;
variantDatum.alleleCount = vc.getChromosomeCount(vc.getAlternateAllele(0)); // BUGBUG: assumes file has genotypes. Also, what to do about tri-allelic sites?
final double acPriorLog10 = Math.log10(theModel.getAlleleCountPrior( variantDatum.alleleCount ));
final double knownPriorLog10 = Math.log10( variantDatum.isKnown ? QualityUtils.qualToProb(KNOWN_QUAL_PRIOR) : QualityUtils.qualToProb(NOVEL_QUAL_PRIOR) );
final double pTrue = Math.pow(10.0, theModel.evaluateVariantLog10( vc ) + acPriorLog10 + knownPriorLog10);
final double acPrior = theModel.getAlleleCountPrior( variantDatum.alleleCount );
double knownPrior = variantDatum.isKnown ? QualityUtils.qualToProb(KNOWN_QUAL_PRIOR) : QualityUtils.qualToProb(NOVEL_QUAL_PRIOR);
if(variantDatum.isKnown && DbSNPHelper.isHapmap( dbsnp ) ) {
knownPrior = QualityUtils.qualToProb(HAPMAP_QUAL_PRIOR);
}
final double totalPrior = 1.0 - ((1.0 - acPrior) * (1.0 - knownPrior));
if( MathUtils.compareDoubles(totalPrior, 1.0, 1E-8) == 0 || MathUtils.compareDoubles(totalPrior, 0.0, 1E-8) == 0 ) {
throw new StingException("Some is wrong with the prior that was entered by the user: Prior = " + totalPrior); // TODO - fix this up later
}
double pVar = theModel.evaluateVariant( vc );
if( pVar > 1.0 ) {
pVar = 0.99;
numUnstable++;
}
final double lod = (Math.log10(totalPrior) + Math.log10(pVar)) - ((Math.log10(1.0 - totalPrior)) + Math.log10(1.0 - pVar));
variantDatum.qual = 10.0 * QualityUtils.lodToPhredScaleErrorRate(lod);
variantDatum.qual = QUALITY_SCALE_FACTOR * QualityUtils.phredScaleErrorRate( Math.max(1.0 - pTrue, 0.000000001) ); // BUGBUG: don't have a normalizing constant, so need to scale up qual scores arbitrarily
mapList.add( variantDatum );
Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
@ -228,6 +245,10 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
vcfWriter.close();
if( numUnstable > 0 ) {
logger.warn("WARNING: Found " + numUnstable + " variant(s) with pVar > 1, Most likely numerical instability during clustering !!!!!");
}
final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
reduceSum.clear(); // Don't need this ever again, clean up some memory

View File

@ -30,7 +30,11 @@ public class QualityUtils {
}
/**
 * Convert a Phred-scaled quality score to the probability of being true.
 *
 * @param qual Phred-scaled quality score (integer form)
 * @return probability that the call is correct: 1 - 10^(-qual/10)
 */
static public double qualToProb(int qual) {
    // Delegate to the double overload so the formula lives in exactly one place.
    return qualToProb( (double)qual );
}

/**
 * Convert a Phred-scaled quality score to the probability of being true.
 *
 * @param qual Phred-scaled quality score
 * @return probability that the call is correct: 1 - 10^(-qual/10)
 */
static public double qualToProb(double qual) {
    return 1.0 - Math.pow(10.0, qual/(-10.0));
}
/**
@ -76,6 +80,10 @@ public class QualityUtils {
/**
 * Convert an error probability to the Phred scale: Q = -10 * log10(P(error)).
 *
 * @param errorRate probability that the call is wrong
 * @return the Phred-scaled representation of that error rate
 */
static public double phredScaleErrorRate(double errorRate) {
    final double log10ErrorRate = Math.log10(errorRate);
    return log10ErrorRate * -10.0;
}

/**
 * Convert a log10 likelihood ratio (LOD) into a Phred-scaled error rate.
 * The error probability implied by the LOD is P(false) = 1 / (10^lod + 1).
 *
 * @param lod log10 odds ratio of the true vs. false hypothesis
 * @return the Phred-scaled error rate corresponding to the LOD
 */
static public double lodToPhredScaleErrorRate(double lod) {
    final double impliedErrorRate = 1.0 / (1.0 + Math.pow(10.0, lod));
    return phredScaleErrorRate(impliedErrorRate);
}
/**
* Return a quality score, capped at max qual.

View File

@ -12,7 +12,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testGenerateVariantClusters() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "72558d2c49bb94dc59e9d4146fe0bc05" );
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "05d1692624a28cd9446feac8fd2408ab" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();
@ -37,7 +37,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testVariantRecalibrator() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "0cb94385ced8a7a537d7bc79f82c01d3" );
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "4570c93c96de61342d75c1c658e2d5c1" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();