From e3962c0d13d7a64e4d838bcfc44ebf22fd4070c0 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Fri, 3 Sep 2010 15:50:19 +0000 Subject: [PATCH] VR integration tests are longer but much more useful. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4210 348d0f76-0448-11de-a6fe-93d51630548a --- .../ApplyVariantCuts.java | 1 - .../GenerateVariantClustersWalker.java | 6 ++--- .../VariantGaussianMixtureModel.java | 8 ++++-- .../org/broadinstitute/sting/BaseTest.java | 1 + ...ntRecalibrationWalkersIntegrationTest.java | 27 ++++++++++--------- 5 files changed, 23 insertions(+), 20 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyVariantCuts.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyVariantCuts.java index c4ebbacef..b77369096 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyVariantCuts.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyVariantCuts.java @@ -134,7 +134,6 @@ public class ApplyVariantCuts extends RodWalker { public void initialize() { // todo -- ryan, it's always best to use a data structure, I need to read these in too. - // todo -- I would have updated your code but there's no integration test to protect me from unexpected effects boolean firstLine = true; try { for( final String line : new XReadLines( TRANCHES_FILE ) ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java index bb0b3bed9..1ecbb22dc 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java @@ -139,10 +139,8 @@ public class GenerateVariantClustersWalker extends RodWalker 0 clustering weight and " + dataManager.numAnnotations + " annotations." ); - logger.info( "The annotations are: " + annotationKeys ); + logger.info( "There are " + dataManager.numVariants + " variants with > 0 clustering weight and qual > threshold (--qualThreshold = " + QUAL_THRESHOLD + ")" ); + logger.info( "The annotations used for clustering are: " + annotationKeys ); dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ] diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java index b35784865..d9cdbd243 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java @@ -68,7 +68,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel private double[] pClusterLog10; private final double[] determinant; private final double stdThreshold; - private double singletonFPRate = -1; // Est. FP rate for singleton calls. Used to estimate FP rate as a function of AC + private double singletonFPRate = -1; // Estimated FP rate for singleton calls. Used to estimate FP rate as a function of AC private double[] empiricalMu; private Matrix empiricalSigma; @@ -181,16 +181,20 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel // Only cluster with a good set of knowns. Filter based on being too many std's away from the mean annotation value // Filtering based on known status and qual threshold happens in GenerateVariantClusters + long numAboveSTD = 0L; for( int iii = 0; iii < dataManager.data.length; iii++ ) { final VariantDatum datum = dataManager.data[iii]; for( final double val : datum.annotations ) { if( Math.abs(val) > stdThreshold ) { datum.weight = 0.0; + numAboveSTD++; break; } } } + logger.info( numAboveSTD + " variants were rejected from the training set for having annotation values more than X standard deviations away from the mean. (--stdThreshold = " + stdThreshold + ")" ); + generateEmpricalStats( dataManager.data ); logger.info("Initializing using k-means..."); @@ -338,7 +342,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel if(rand.nextBoolean()) { randSigma[ppp][jjj] *= -1.0; } - if(jjj != ppp) { randSigma[jjj][ppp] = 0.0; } // Sigma is a symmetric, positive-definite matrix created by taking a lower diagonal matrix and multiplying by its transpose + if(jjj != ppp) { randSigma[jjj][ppp] = 0.0; } // Sigma is a symmetric, positive-definite matrix created by taking a lower diagonal matrix and multiplying it by its transpose } } if( FORCE_INDEPENDENT_ANNOTATIONS ) { diff --git a/java/test/org/broadinstitute/sting/BaseTest.java b/java/test/org/broadinstitute/sting/BaseTest.java index e89f286d9..d31c89dbe 100755 --- a/java/test/org/broadinstitute/sting/BaseTest.java +++ b/java/test/org/broadinstitute/sting/BaseTest.java @@ -48,6 +48,7 @@ public abstract class BaseTest { protected static String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/"; protected static String validationDataLocation = GATKDataLocation + "Validation_Data/"; protected static String evaluationDataLocation = GATKDataLocation + "Evaluation_Data/"; + protected static String comparisonDataLocation = GATKDataLocation + "Comparisons/"; protected static String testDir = "testdata/"; protected static boolean alreadySetup = false; diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 8e81afaba..f12507c98 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -14,8 +14,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testGenerateVariantClusters() { HashMap e = new HashMap(); - e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "cb3e8d9072d478243c3f9d0ee09fef3b" ); - e.put( validationDataLocation + "lowpass.N3.chr1.raw.0.vcf", "0d4fee916a886ca98c286d3b7fed0ff6" ); + e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "7c5431a560e9ca257523cf68b808b058" ); + e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf", "a438635bc96a80c5e6f090d82a394819" ); for ( Map.Entry entry : e.entrySet() ) { String vcf = entry.getKey(); @@ -24,11 +24,12 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + - " -B:hapmap,VCF " + validationDataLocation + "CEU_hapmap_nogt_23.vcf" + - " -weightDBSNP 1.0 -weightHapMap 1.0" + + " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/genotypes_r27_nr.b36_fwd.vcf" + + " -weightDBSNP 0.2 -weightHapMap 1.0" + " -T GenerateVariantClusters" + " -B:input,VCF " + vcf + - " -L 1:1-100,000,000" + + " -L 1:50,000,000-200,000,000" + + " -qual 50.0" + " --ignore_filter GATK_STANDARD" + " -an QD -an HRun -an SB" + " -clusterFile %s", @@ -43,9 +44,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { public void testVariantRecalibrator() { HashMap> e = new HashMap>(); e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", - Arrays.asList("9c25a88c9fa48d7373029d2bfb40ad54", "937080353c7e03e11f8a70fc0004bf76","5b89fa5a4edf0080d64230d4103d2b8d")); // Each test checks the md5 of three output files - e.put( validationDataLocation + "lowpass.N3.chr1.raw.0.vcf", - Arrays.asList("074462f829ff553584e9b5e330be6d4b", "f7c5c6cff9dd5280b25e24e0591e4cb0","1de1473db5720b882edf1381fa3dd039")); // Each test checks the md5 of three output files + Arrays.asList("dad6d1458a453a1b720bd79be922df73", "2a70516b26f6ccf9db638153661eee62","c1952240ff3105354e04363c18a8eab9")); // Each test checks the md5 of three output files + e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf", + Arrays.asList("89d7e43d9f1086c859abb38fb1b51d40", "d2471d2dd956392de7e776463299e28d","eeafcb907f93ace2ac47a98497b52b0d")); // Each test checks the md5 of three output files for ( Map.Entry> entry : e.entrySet() ) { String vcf = entry.getKey(); @@ -56,10 +57,10 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + - " -B:hapmap,VCF " + validationDataLocation + "CEU_hapmap_nogt_23.vcf" + + " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/genotypes_r27_nr.b36_fwd.vcf" + " -T VariantRecalibrator" + " -B:input,VCF " + vcf + - " -L 1:40,000,000-100,000,000" + + " -L 1:20,000,000-100,000,000" + " --ignore_filter GATK_STANDARD" + " --ignore_filter HARD_TO_VALIDATE" + " -clusterFile " + clusterFile + @@ -81,8 +82,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testApplyVariantCuts() { HashMap e = new HashMap(); - e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "5b00a80b2a8fd078d6c2aa16924920e4" ); - e.put( validationDataLocation + "lowpass.N3.chr1.raw.0.vcf", "d785380792da88f8d64fe1cf1133eeb0" ); + e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "5511e26689ddd1486049db083dcd79c0" ); + e.put( validationDataLocation + "lowpass.N3.chr1.raw.vcf", "4f7f13d1f190e882a466003ce0917222" ); for ( Map.Entry entry : e.entrySet() ) { String vcf = entry.getKey(); @@ -96,7 +97,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { "-R " + b36KGReference + " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + " -T ApplyVariantCuts" + - " -L 1:40,000,000-100,000,000" + + " -L 1:20,000,000-100,000,000" + " -B:input,VCF " + inputVCFFile + " -o %s" + " -tranchesFile " + tranchesFile,