From b42e0a398ee9a9c472609caf00a651f9d33ccd53 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Mon, 8 Mar 2010 13:02:08 +0000 Subject: [PATCH] Bug fix in variant optimizer for when there are more novel variants than known variants in the callset. Changing the magic numbers related to the starting sigma values for the gaussian clusters. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2952 348d0f76-0448-11de-a6fe-93d51630548a --- .../VariantGaussianMixtureModel.java | 39 ++++++++++++------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java index 4de6697a9..30717a1a7 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java @@ -74,21 +74,28 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel numSubset++; } } - final VariantDatum[] data = new VariantDatum[numSubset*2]; - int iii = 0; - for( final VariantDatum datum : dataManager.data ) { - if( !datum.isKnown ) { - data[iii++] = datum; + VariantDatum[] data; + + if( numSubset * 2 * 1.3 < dataManager.numVariants ) { + data = new VariantDatum[numSubset*2]; + int iii = 0; + for( final VariantDatum datum : dataManager.data ) { + if( !datum.isKnown ) { + data[iii++] = datum; + } } - } - while( iii < numSubset*2 ) { // grab an equal number of known variants at random - final VariantDatum datum = dataManager.data[rand.nextInt(dataManager.numVariants)]; - if( datum.isKnown ) { - data[iii++] = datum; + while( iii < numSubset*2 ) { // grab an equal number of known variants at random + final VariantDatum datum = dataManager.data[rand.nextInt(dataManager.numVariants)]; + if( datum.isKnown ) { + data[iii++] = datum; + } } + } else { + data = dataManager.data; } - System.out.println("Clustering with " + numSubset*2 + " variants..."); + System.out.println("Clustering with " + data.length + " variants..."); + if( data.length == dataManager.numVariants ) { System.out.println(" (used all variants since 2*numNovel is so large compared to the full set) "); } createClusters( data ); // Using a subset of the data System.out.println("Printing out cluster parameters..."); printClusters( outputPrefix ); @@ -124,7 +131,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel final double[] randSigma = new double[numAnnotations]; if( dataManager.isNormalized ) { for( int jjj = 0; jjj < numAnnotations; jjj++ ) { - randSigma[jjj] = 0.9 + 0.2 * rand.nextDouble(); + randSigma[jjj] = 0.75 + 0.4 * rand.nextDouble(); } } else { // BUGBUG: if not normalized then the varianceVector hasn't been calculated --> null pointer for( int jjj = 0; jjj < numAnnotations; jjj++ ) { @@ -414,7 +421,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel // Find a place to put the example variant if( rrr == 0 ) { // Replace the big cluster that kicked this process off mu[kkk] = data[randVarIndex].annotations; - //sigma[kkk] = savedSigma; pCluster[kkk] = 1.0 / ((double) numGaussians); } else { // Replace the cluster with the minimum prob double minProb = pCluster[0]; @@ -428,7 +434,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel mu[minClusterIndex] = data[randVarIndex].annotations; sigma[minClusterIndex] = savedSigma; for( int jjj = 0; jjj < numAnnotations; jjj++ ) { - sigma[minClusterIndex][jjj] += -0.02 + 0.04 * rand.nextDouble(); + sigma[minClusterIndex][jjj] += -0.06 + 0.12 * rand.nextDouble(); + if( sigma[minClusterIndex][jjj] < MIN_SUM_PROB ) { + sigma[minClusterIndex][jjj] = MIN_SUM_PROB; + } } pCluster[minClusterIndex] = 1.0 / ((double) numGaussians); } @@ -444,7 +453,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel final double[] randSigma = new double[numAnnotations]; if( dataManager.isNormalized ) { for( int jjj = 0; jjj < numAnnotations; jjj++ ) { - randSigma[jjj] = 0.9 + 0.2 * rand.nextDouble(); + randSigma[jjj] = 0.6 + 0.4 * rand.nextDouble(); // Explore a wider range } } else { // BUGBUG: if not normalized then the varianceVector hasn't been calculated --> null pointer for( int jjj = 0; jjj < numAnnotations; jjj++ ) {