From 8f15b2ba72d44a794935a546db93ca6378594a68 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Tue, 17 Aug 2010 21:57:28 +0000 Subject: [PATCH] Memory optimization for the VariantRecalibrator. Only add variants to the list if they pass the novelty and qual filters. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4051 348d0f76-0448-11de-a6fe-93d51630548a --- .../GenerateVariantClustersWalker.java | 7 +- .../VariantGaussianMixtureModel.java | 102 ++++++------------ ...ntRecalibrationWalkersIntegrationTest.java | 4 +- 3 files changed, 40 insertions(+), 73 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java index e57fb4f52..bc60bed65 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GenerateVariantClustersWalker.java @@ -153,7 +153,6 @@ public class GenerateVariantClustersWalker extends RodWalker 0.0 && variantDatum.qual > QUAL_THRESHOLD ) { + mapList.add( variantDatum ); + } } } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java index 715a13fb5..ffcbd2085 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantGaussianMixtureModel.java @@ -188,77 +188,41 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel // Initialize the Allele Count prior generateAlleleCountPrior(); - int numValid = 0; - int numOutlier = 0; - int numBadQual = 0; - int numZeroWeight = 0; +// int numValid = 0; +// int numOutlier = 0; +// int numBadQual = 0; +// int numZeroWeight = 0; - // Only cluster with a good set of knowns. Filter based on being too many std's away from the mean annotation value or having low quality score - for( final VariantDatum datum : dataManager.data ) { - boolean goodVar = true; - if(!(datum.weight > 0.0)) { - goodVar = false; - numZeroWeight++; - } - if(goodVar) { - for( final double val : datum.annotations ) { - if( Math.abs(val) > stdThreshold ) { - goodVar = false; - numOutlier++; - break; - } + // Only cluster with a good set of knowns. Filter based on being too many std's away from the mean annotation value + // Filtering based on known status and qual threshold happens in GenerateVariantClusters + for( int iii = 0; iii < dataManager.data.length; iii++ ) { + final VariantDatum datum = dataManager.data[iii]; + for( final double val : datum.annotations ) { + if( Math.abs(val) > stdThreshold ) { + datum.weight = 0.0; + break; } } - if(goodVar) { - if( datum.qual < qualThreshold ) { - goodVar = false; - numBadQual++; - } - } - if(goodVar) { numValid++; } } - final VariantDatum data[] = new VariantDatum[numValid]; - int iii = 0; - for( final VariantDatum datum : dataManager.data ) { - boolean goodVar = true; - if(!(datum.weight > 0.0)) { - goodVar = false; - } - if(goodVar) { - for( final double val : datum.annotations ) { - if( Math.abs(val) > stdThreshold ) { - goodVar = false; - break; - } - } - } - if(goodVar) { - if( datum.qual < qualThreshold ) { - goodVar = false; - } - } - if(goodVar) { data[iii++] = datum; } - } +// logger.info("Clustering with " + data.length + " valid variants."); +// logger.info(" " + numZeroWeight + " variants were removed from clustering due to having zero clustering weight."); +// logger.info(" " + numOutlier + " variants were removed due to having annotations that were more than " + stdThreshold + " standard deviations away from the mean annotation value."); +// logger.info(" " + numBadQual + " variants were removed because raw QUAL value was less than threshold (" + qualThreshold + ")."); - logger.info("Clustering with " + data.length + " valid variants."); - logger.info(" " + numZeroWeight + " variants were removed from clustering due to having zero clustering weight."); - logger.info(" " + numOutlier + " variants were removed due to having annotations that were more than " + stdThreshold + " standard deviations away from the mean annotation value."); - logger.info(" " + numBadQual + " variants were removed because raw QUAL value was less than threshold (" + qualThreshold + ")."); - - generateEmpricalStats( data ); + generateEmpricalStats( dataManager.data ); logger.info("Initializing using k-means..."); - initializeUsingKMeans( data ); + initializeUsingKMeans( dataManager.data ); logger.info("... done!"); - createClusters( data, 0, maxGaussians, clusterFileName ); + createClusters( dataManager.data, 0, maxGaussians, clusterFileName ); // Simply cluster with all the variants. The knowns have been given more weight than the novels //logger.info("Clustering with " + dataManager.data.length + " variants."); //createClusters( dataManager.data, 0, numGaussians, clusterFileName ); } - private void generateEmpricalStats( VariantDatum[] data ) { + private void generateEmpricalStats( final VariantDatum[] data ) { final int numVariants = data.length; final int numAnnotations = data[0].annotations.length; @@ -323,7 +287,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel } } - public void setSingletonFPRate(double rate) { + public void setSingletonFPRate( final double rate ) { this.singletonFPRate = rate; } @@ -358,19 +322,21 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel final int[] assignment = new int[numVariants]; for( int iii = 0; iii < numVariants; iii++ ) { final VariantDatum datum = data[iii]; - double minDistance = Double.MAX_VALUE; - int minCluster = -1; - for( int kkk = 0; kkk < maxGaussians; kkk++ ) { - double dist = 0.0; - for( int jjj = 0; jjj < numAnnotations; jjj++ ) { - dist += (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]); - } - if(dist < minDistance) { - minDistance = dist; - minCluster = kkk; + if(datum.weight > 0.0) { + double minDistance = Double.MAX_VALUE; + int minCluster = -1; + for( int kkk = 0; kkk < maxGaussians; kkk++ ) { + double dist = 0.0; + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + dist += (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]); + } + if(dist < minDistance) { + minDistance = dist; + minCluster = kkk; + } } + assignment[iii] = minCluster; } - assignment[iii] = minCluster; } for( int kkk = 0; kkk < maxGaussians; kkk++ ) { diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index 1987855f8..48c8bd2fb 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -12,7 +12,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testGenerateVariantClusters() { HashMap e = new HashMap(); - e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "05d1692624a28cd9446feac8fd2408ab" ); + e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "005898194773d2f2be45a68f6e4c4c25" ); for ( Map.Entry entry : e.entrySet() ) { String vcf = entry.getKey(); @@ -37,7 +37,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testVariantRecalibrator() { HashMap e = new HashMap(); - e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "3c5a41567b60b777875d585a379eb231" ); + e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "acb89428115564cbf35ab7c006252108" ); for ( Map.Entry entry : e.entrySet() ) { String vcf = entry.getKey();