diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java index de111af6f..e74f0732b 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java @@ -87,7 +87,7 @@ public class GenerateVariantClustersWalker extends RodWalker 0.0) { + if(assignment[iii] == kkk) { + numAssigned++; + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + mu[kkk][jjj] += data[iii].annotations[jjj]; + } } } } @@ -924,6 +927,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel return Math.log(x) - ( 1.0 / (2.0 * x) ) - ( 1.0 / (12.0 * Math.pow(x, 2.0)) ) + ( 1.0 / (120.0 * Math.pow(x, 4.0)) ) - - ( 1.0 / (252.0 * Math.pow(x, 6.0)) ); + - ( 1.0 / (252.0 * Math.pow(x, 6.0)) ); } } diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index 68017b039..afdf151d6 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -371,6 +371,14 @@ public class MathUtils { return Math.sqrt(rms); } + public static double distanceSquared( final double[] x, final double[] y ) { + double dist = 0.0; + for(int iii = 0; iii < x.length; iii++) { + dist += (x[iii] - y[iii]) * (x[iii] - y[iii]); + } + return dist; + } + /** * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). * @@ -401,6 +409,27 @@ public class MathUtils { return normalized; } + public static double[] normalizeFromLog10(List array, boolean takeLog10OfOutput) { + double[] normalized = new double[array.size()]; + + // for precision purposes, we need to add (or really subtract, since they're + // all negative) the largest value; also, we need to convert to normal-space. + double maxValue = MathUtils.arrayMax( array ); + for (int i = 0; i < array.size(); i++) + normalized[i] = Math.pow(10, array.get(i) - maxValue); + + // normalize + double sum = 0.0; + for (int i = 0; i < array.size(); i++) + sum += normalized[i]; + for (int i = 0; i < array.size(); i++) { + double x = normalized[i] / sum; + if ( takeLog10OfOutput ) x = Math.log10(x); + normalized[i] = x; + } + + return normalized; + } /** * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). * @@ -412,6 +441,10 @@ public class MathUtils { return normalizeFromLog10(array, false); } + public static double[] normalizeFromLog10(List array) { + return normalizeFromLog10(array, false); + } + public static int maxElementIndex(double[] array) { if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); @@ -469,6 +502,15 @@ public class MathUtils { return m; } + public static double arrayMax(List array) { + if ( array == null ) throw new IllegalArgumentException("Array cannot be null!"); + if ( array.size() == 0 ) throw new IllegalArgumentException("Array size cannot be 0!"); + + double m = array.get(0); + for ( double e : array ) m = Math.max(m, e); + return m; + } + public static double average(List vals, int maxI) { long sum = 0L; @@ -814,6 +856,15 @@ public class MathUtils { return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.)); } + // from http://en.wikipedia.org/wiki/Digamma_function + // According to J.M. Bernardo AS 103 algorithm the digamma function for x, a real number, can be approximated by: + public static double diGamma(final double x) { + return Math.log(x) - ( 1.0 / (2.0 * x) ) + - ( 1.0 / (12.0 * Math.pow(x, 2.0)) ) + + ( 1.0 / (120.0 * Math.pow(x, 4.0)) ) + - ( 1.0 / (252.0 * Math.pow(x, 6.0)) ); + } + /** A utility class that computes on the fly average and standard deviation for a stream of numbers. * The number of observations does not have to be known in advance, and can be also very big (so that * it could overflow any naive summation-based scheme or cause loss of precision). diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index a98235d79..3764859cc 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -29,16 +29,16 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { } VRTest yriTrio = new VRTest("yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", - "4eeffa7a1965ce0c25c5edd0bae76290", // in vcf - "a04e00d00c991d76900634376bc1a9d1", // tranches - "f34b36c1da8bcb080a584592d1f6dae7", // recalVCF - "ee07ff6c0ff6108e00b131ce3c7a7f65"); // cut VCF + "e14079c3a02c112665a6c194fd4f5d5c", // cluster file + "dce581b880ffb6ea39cbada1ecc95915", // tranches + "c3e8a2f43656eab7d847dbf850f844a6", // recalVCF + "50f752a72643db9ad0aa94b3fc4e23d6"); // cut VCF VRTest lowPass = new VRTest("lowpass.N3.chr1.raw.vcf", - "8937a3ae7f176dacf47b8ee6c0023416", // in vcf - "8da48dec888c9d4e3c3c7ed7943c0c61", // tranches - "ae6a1e0874c966312e891b5a3c47b0e3", // recalVCF - "3ac16abdc5bbd5148f0cf88e8a7af3c9"); // cut VCF + "b0c0f8c8d9fe3d1ed2bde0b1eb82a22d", // cluster file + "66edae83c50f4e8601fef7fafba774af", // tranches + "0123537e373657386068a534c0f5c91b", // recalVCF + "2172368e8585841e5ad96c95d0827c4b"); // cut VCF @DataProvider(name = "VRTest") public Object[][] createData1() { @@ -51,14 +51,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { "-R " + b36KGReference + " -NO_HEADER" + " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + - " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/genotypes_r27_nr.b36_fwd.vcf" + - " -weightDBSNP 0.2 -weightHapMap 1.0" + + " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" + + " -weightDBSNP 1.0 -weightHapMap 1.0" + " -T GenerateVariantClusters" + " -B:input,VCF " + params.inVCF + " -L 1:50,000,000-200,000,000" + " -qual 50.0" + " --ignore_filter GATK_STANDARD" + - " -an QD -an HRun -an SB" + + " -an QD -an MQ -an SB" + " -clusterFile %s", Arrays.asList(params.clusterMD5)); executeTest("testGenerateVariantClusters-"+params.inVCF, spec).getFirst(); @@ -72,13 +72,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -NO_HEADER" + " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" + + " -B:truthHapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" + " -T VariantRecalibrator" + " -B:input,VCF " + params.inVCF + " -L 1:20,000,000-100,000,000" + " --ignore_filter GATK_STANDARD" + " --ignore_filter HARD_TO_VALIDATE" + " -clusterFile " + getFileForMD5(params.clusterMD5) + - " -titv 2.07" + + " -sm TRUTH_SENSITIVITY" + " -o %s" + " -tranchesFile %s", Arrays.asList(params.recalVCFMD5, params.tranchesMD5));