Minor bug fix in k-means implementation. Updating VQSR integration tests in preparation for VQSRv2 by removing some unused features such as VariantDatum.weight and ti/tv cutting.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5410 348d0f76-0448-11de-a6fe-93d51630548a
2011-03-09 00:26:28 +00:00 · 2011-03-09 00:26:28 +00:00 · 509daac9f7
parent fa7284b7a1
commit 509daac9f7
4 changed files with 75 additions and 20 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java
@ -87,7 +87,7 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
    private double QUAL_THRESHOLD = 100.0;
    @Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
    private double SHRINKAGE = 0.0001;
-    @Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algoirthm.", required=false)
+    @Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algorithm.", required=false)
    private double DIRICHLET_PARAMETER = 1000.0;

    /////////////////////////////
--- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java
@ -224,7 +224,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
        empiricalMu = new double[numAnnotations];
        final double[][] sigmaVals = new double[numAnnotations][numAnnotations];

-
        for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
            empiricalMu[jjj] = 0.0;
            for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
@ -268,7 +267,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
        final int numAnnotations = data[0].annotations.length;

        for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
-            mu[kkk] = data[rand.nextInt(numVariants)].annotations;
+            //mu[kkk] = data[rand.nextInt(numVariants)].annotations;
+            mu[kkk] = new double[numAnnotations];
            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                mu[kkk][jjj] = -4.0 + 8.0 * rand.nextDouble();
            }
@ -310,10 +310,13 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
            int numAssigned = 0;

            for( int iii = 0; iii < numVariants; iii++ ) {
-                if(assignment[iii] == kkk) {
-                    numAssigned++;
-                    for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
-                        mu[kkk][jjj] += data[iii].annotations[jjj];
+                final VariantDatum datum = data[iii];
+                if(datum.weight > 0.0) {
+                    if(assignment[iii] == kkk) {
+                        numAssigned++;
+                        for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
+                            mu[kkk][jjj] += data[iii].annotations[jjj];
+                        }
                    }
                }
            }
@ -924,6 +927,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
        return Math.log(x) - ( 1.0 / (2.0 * x) )
                           - ( 1.0 / (12.0 * Math.pow(x, 2.0)) )
                           + ( 1.0 / (120.0 * Math.pow(x, 4.0)) )
-                           - ( 1.0 / (252.0 * Math.pow(x, 6.0)) ); 
+                           - ( 1.0 / (252.0 * Math.pow(x, 6.0)) );
    }
 }
--- a/java/src/org/broadinstitute/sting/utils/MathUtils.java
+++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java
@ -371,6 +371,14 @@ public class MathUtils {
        return Math.sqrt(rms);
    }

+    public static double distanceSquared( final double[] x, final double[] y ) {
+        double dist = 0.0;
+        for(int iii = 0; iii < x.length; iii++) {
+            dist += (x[iii] - y[iii]) * (x[iii] - y[iii]);
+        }
+        return dist;
+    }
+
    /**
     * normalizes the log10-based array.  ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE).
     *
@ -401,6 +409,27 @@ public class MathUtils {
        return normalized;
    }

+    public static double[] normalizeFromLog10(List<Double> array, boolean takeLog10OfOutput) {
+        double[] normalized = new double[array.size()];
+
+        // for precision purposes, we need to add (or really subtract, since they're
+        // all negative) the largest value; also, we need to convert to normal-space.
+        double maxValue = MathUtils.arrayMax( array );
+        for (int i = 0; i < array.size(); i++)
+            normalized[i] = Math.pow(10, array.get(i) - maxValue);
+
+        // normalize
+        double sum = 0.0;
+        for (int i = 0; i < array.size(); i++)
+            sum += normalized[i];
+        for (int i = 0; i < array.size(); i++) {
+            double x = normalized[i] / sum;
+            if ( takeLog10OfOutput ) x = Math.log10(x);
+            normalized[i] = x;
+        }
+
+        return normalized;
+    }
    /**
     * normalizes the log10-based array.  ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE).
     *
@ -412,6 +441,10 @@ public class MathUtils {
        return normalizeFromLog10(array, false);
    }

+    public static double[] normalizeFromLog10(List<Double> array) {
+        return normalizeFromLog10(array, false);
+    }
+
    public static int maxElementIndex(double[] array) {
        if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");

@ -469,6 +502,15 @@ public class MathUtils {
        return m;
    }

+    public static double arrayMax(List<Double> array) {
+        if ( array == null ) throw new IllegalArgumentException("Array cannot be null!");
+        if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!");
+
+        double m = array.get(0);
+        for ( double e : array ) m = Math.max(m, e);
+        return m;
+    }
+
    public static double average(List<Long> vals, int maxI) {
        long sum = 0L;

@ -814,6 +856,15 @@ public class MathUtils {
        return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.));
    }

+    // from http://en.wikipedia.org/wiki/Digamma_function
+    // According to J.M. Bernardo AS 103 algorithm the digamma function for x, a real number, can be approximated by:
+    public static double diGamma(final double x) {
+        return Math.log(x) - ( 1.0 / (2.0 * x) )
+                           - ( 1.0 / (12.0 * Math.pow(x, 2.0)) )
+                           + ( 1.0 / (120.0 * Math.pow(x, 4.0)) )
+                           - ( 1.0 / (252.0 * Math.pow(x, 6.0)) );
+    }
+
    /** A utility class that computes on the fly average and standard deviation for a stream of numbers.
     * The number of observations does not have to be known in advance, and can be also very big (so that
     * it could overflow any naive summation-based scheme or cause loss of precision).
--- a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java
+++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java
@ -29,16 +29,16 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
    }

    VRTest yriTrio = new VRTest("yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf",
-            "4eeffa7a1965ce0c25c5edd0bae76290",  // in vcf
-            "a04e00d00c991d76900634376bc1a9d1",  // tranches
-            "f34b36c1da8bcb080a584592d1f6dae7",  // recalVCF
-            "ee07ff6c0ff6108e00b131ce3c7a7f65"); // cut VCF
+            "e14079c3a02c112665a6c194fd4f5d5c",  // cluster file
+            "dce581b880ffb6ea39cbada1ecc95915",  // tranches
+            "c3e8a2f43656eab7d847dbf850f844a6",  // recalVCF
+            "50f752a72643db9ad0aa94b3fc4e23d6"); // cut VCF

    VRTest lowPass = new VRTest("lowpass.N3.chr1.raw.vcf",
-            "8937a3ae7f176dacf47b8ee6c0023416",  // in vcf
-            "8da48dec888c9d4e3c3c7ed7943c0c61",  // tranches
-            "ae6a1e0874c966312e891b5a3c47b0e3",  // recalVCF
-            "3ac16abdc5bbd5148f0cf88e8a7af3c9"); // cut VCF
+            "b0c0f8c8d9fe3d1ed2bde0b1eb82a22d",  // cluster file
+            "66edae83c50f4e8601fef7fafba774af",  // tranches
+            "0123537e373657386068a534c0f5c91b",  // recalVCF
+            "2172368e8585841e5ad96c95d0827c4b"); // cut VCF

    @DataProvider(name = "VRTest")
    public Object[][] createData1() {
@ -51,14 +51,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
                "-R " + b36KGReference +
                        " -NO_HEADER" +
                        " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
-                        " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/genotypes_r27_nr.b36_fwd.vcf" +
-                        " -weightDBSNP 0.2 -weightHapMap 1.0" +
+                        " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
+                        " -weightDBSNP 1.0 -weightHapMap 1.0" +
                        " -T GenerateVariantClusters" +
                        " -B:input,VCF " + params.inVCF +
                        " -L 1:50,000,000-200,000,000" +
                        " -qual 50.0" +
                        " --ignore_filter GATK_STANDARD" +
-                        " -an QD -an HRun -an SB" +
+                        " -an QD -an MQ -an SB" +
                        " -clusterFile %s",
                Arrays.asList(params.clusterMD5));
        executeTest("testGenerateVariantClusters-"+params.inVCF, spec).getFirst();
@ -72,13 +72,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
                        " -NO_HEADER" +
                        " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
                        " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
+                        " -B:truthHapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
                        " -T VariantRecalibrator" +
                        " -B:input,VCF " + params.inVCF +
                        " -L 1:20,000,000-100,000,000" +
                        " --ignore_filter GATK_STANDARD" +
                        " --ignore_filter HARD_TO_VALIDATE" +
                        " -clusterFile " + getFileForMD5(params.clusterMD5) +
-                        " -titv 2.07" +
+                        " -sm TRUTH_SENSITIVITY" +
                        " -o %s" +
                        " -tranchesFile %s",
                Arrays.asList(params.recalVCFMD5, params.tranchesMD5));