Major update to the Variant Optimizer. It now performs clustering for both the titv and titv-less models simultaneously, outputting the cluster files at every iteration. It makes use of the Jama matrix library to do full inverse and determinant calculation for the covariance matrix where before it was using only approximations.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3286 348d0f76-0448-11de-a6fe-93d51630548a
2010-05-02 19:21:23 +00:00 · 2010-05-02 19:21:23 +00:00 · 9d01670f62
parent a318b1871d
commit 9d01670f62
5 changed files with 493 additions and 470 deletions
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java
@ -159,12 +159,17 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
                    final VariantDatum variantDatum = new VariantDatum();
                    variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
                    variantDatum.isKnown = !vc.getAttribute("ID").equals(".");
-                    variantDatum.isHet = vc.getHetCount() > vc.getHomVarCount(); // BUGBUG: what to do here for multi sample calls?

-                    final double pTrue = theModel.evaluateVariant( vc.getAttributes(), vc.getPhredScaledQual(), variantDatum.isHet );
-                    final double recalQual = QualityUtils.phredScaleErrorRate( Math.max(1.0 - pTrue, 0.000000001) );                             
+                    final double pTrue = theModel.evaluateVariant( vc.getAttributes(), vc.getPhredScaledQual() );
+                    double recalQual = QualityUtils.phredScaleErrorRate( Math.max(1.0 - pTrue, 0.000000001) );
+                    if( !theModel.isUsingTiTvModel ) {
+                        recalQual *= 30.0;
+                    } else {
+                        recalQual *= 3.0;
+                    }
+                    // BUGBUG: decide how to scale the quality score

-                    if( variantDatum.isKnown && KNOWN_VAR_QUAL_PRIOR > 0.1 ) {
+                    if( variantDatum.isKnown && KNOWN_VAR_QUAL_PRIOR > 0.1 ) { // only use the known prior if the value is specified (meaning not equal to zero)
                        variantDatum.qual = 0.5 * recalQual + 0.5 * KNOWN_VAR_QUAL_PRIOR;
                    } else {
                        variantDatum.qual = recalQual;
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantClusteringModel.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantClusteringModel.java
@ -32,6 +32,6 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
 */

 public interface VariantClusteringModel extends VariantOptimizationInterface {
-    public void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster );
+    public void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFilename );
    //public void applyClusters( final VariantDatum[] data, final String outputPrefix );
 }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java
@ -25,6 +25,7 @@

 package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;

+import org.apache.log4j.Logger;
 import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
 import org.broadinstitute.sting.utils.StingException;

@ -37,6 +38,9 @@ import java.io.PrintStream;
 */

 public class VariantDataManager {
+
+    protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
+
    public final VariantDatum[] data;
    public final int numVariants;
    public final int numAnnotations;
@ -56,7 +60,7 @@ public class VariantDataManager {
            meanVector = null;
            varianceVector = null;
        } else {
-            numAnnotations = _annotationKeys.size() + 1; // +1 for QUAL
+            numAnnotations = _annotationKeys.size();
            if( numAnnotations <= 0 ) {
                throw new StingException( "There are zero annotations! (or possibly a problem with integer overflow)" );
            }
@ -91,11 +95,12 @@ public class VariantDataManager {
        for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
            final double theMean = mean(data, jjj);
            final double theSTD = standardDeviation(data, theMean, jjj);
-            System.out.println( (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) );
+            logger.info( annotationKeys.get(jjj) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) );
            if( theSTD < 1E-8 ) {
                foundZeroVarianceAnnotation = true;
-                System.out.println("Zero variance is a problem: standard deviation = " + theSTD);
-                System.out.println("User must -exclude annotations with zero variance. Annotation = " + (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)));
+                logger.warn("Zero variance is a problem: standard deviation = " + theSTD + " User must -exclude annotations with zero variance. Annotation = " + (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)));
+            } else if( theSTD < 1E-2 ) {
+                logger.warn("Warning! Tiny variance. It is strongly recommended that you -exclude " + annotationKeys.get(jjj));
            }
            meanVector[jjj] = theMean;
            varianceVector[jjj] = theSTD;
@ -129,7 +134,7 @@ public class VariantDataManager {

    public void printClusterFileHeader( PrintStream outputFile ) {
        for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
-            outputFile.println("@!ANNOTATION," + (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)) + "," + meanVector[jjj] + "," + varianceVector[jjj]);
+            outputFile.println("@!ANNOTATION," + annotationKeys.get(jjj) + "," + meanVector[jjj] + "," + varianceVector[jjj]);
        }
    }
 }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java
@ -55,8 +55,6 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
    /////////////////////////////
    @Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.1 for whole genome experiments)", required=true)
    private double TARGET_TITV = 2.1;
-    @Argument(fullName="known_alpha_factor", shortName="kFactor", doc="Percentage of true positive rate that is due to difference between known and novel titv in a cluster as opposed to difference from target titv", required=false)
-    private double KNOWN_ALPHA_FACTOR = 0.0;
    @Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
    private boolean IGNORE_ALL_INPUT_FILTERS = false;
    @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
@ -68,11 +66,11 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
    @Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
    private String CLUSTER_FILENAME = "optimizer.cluster";
    @Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false)
-    private int NUM_GAUSSIANS = 32;
+    private int NUM_GAUSSIANS = 26;
    @Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false)
    private int NUM_ITERATIONS = 10;
    @Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. It can be used to prevent overfitting.", required=false)
-    private int MIN_VAR_IN_CLUSTER = 2000;
+    private int MIN_VAR_IN_CLUSTER = 1000;

    //@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
    //private int NUM_KNN = 2000;
@ -122,6 +120,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
                if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
                    if( firstVariant ) { // This is the first variant encountered so set up the list of annotations
                        annotationKeys.addAll( vc.getAttributes().keySet() );
+                        annotationKeys.add("QUAL");
                        if( annotationKeys.contains("ID") ) { annotationKeys.remove("ID"); } // ID field is added to the vc's INFO field?
                        if( annotationKeys.contains("DB") ) { annotationKeys.remove("DB"); }
                        if( EXCLUDED_ANNOTATIONS != null ) {
@ -134,7 +133,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
                                if( !annotationKeys.contains( forcedAnnotation ) ) { annotationKeys.add( forcedAnnotation ); }
                            }
                        }
-                        numAnnotations = annotationKeys.size() + 1; // +1 for variant quality ("QUAL")
+                        numAnnotations = annotationKeys.size();
                        annotationValues = new double[numAnnotations];
                        firstVariant = false;
                    }
@ -143,26 +142,28 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
                    for( final String key : annotationKeys ) {

                        double value = 0.0;
-                        try {
-                            value = Double.parseDouble( (String)vc.getAttribute( key, "0.0" ) );
-                            if( Double.isInfinite(value) ) {
-                                value = ( value > 0 ? 1.0 : -1.0 ) * INFINITE_ANNOTATION_VALUE;
+                        if( key.equals("AB") && !vc.getAttributes().containsKey(key) ) {
+                            value = (0.5 - 0.005) + (0.01 * Math.random()); // HomVar calls don't have an allele balance
+                        } else if( key.equals("QUAL") ) {
+                            value = vc.getPhredScaledQual();
+                        } else {
+                            try {
+                                value = Double.parseDouble( (String)vc.getAttribute( key, "0.0" ) );
+                                if( Double.isInfinite(value) ) {
+                                    value = ( value > 0 ? 1.0 : -1.0 ) * INFINITE_ANNOTATION_VALUE;
+                                }
+                            } catch( NumberFormatException e ) {
+                                // do nothing, default value is 0.0
                            }
-                        } catch( NumberFormatException e ) {
-                            // do nothing, default value is 0.0
                        }
                        annotationValues[iii++] = value;
                    }

-                    // Variant quality ("QUAL") is not in the list of annotations, but is useful so add it here.
-                    annotationValues[iii] = vc.getPhredScaledQual();
-
                    final VariantDatum variantDatum = new VariantDatum();
                    variantDatum.annotations = annotationValues;
                    variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
                    variantDatum.isKnown = !vc.getAttribute("ID").equals(".");
-                    variantDatum.isHet = vc.getHetCount() > vc.getHomVarCount(); // BUGBUG: what to do here for multi sample calls?
-                    
+
                    mapList.add( variantDatum );
                }
            }
@ -192,7 +193,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
        reduceSum.clear(); // Don't need this ever again, clean up some memory

        logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations." );
-        logger.info( "The annotations are: " + annotationKeys + " and QUAL." );
+        logger.info( "The annotations are: " + annotationKeys );

        dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]
        
@ -200,7 +201,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
        VariantOptimizationModel theModel;
        switch (OPTIMIZATION_MODEL) {
            case GAUSSIAN_MIXTURE_MODEL:
-                theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER, KNOWN_ALPHA_FACTOR );
+                theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER );
                break;
            //case K_NEAREST_NEIGHBORS:
            //    theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );