From b241e0915b24b6c8744e0b31867717439fa570ee Mon Sep 17 00:00:00 2001 From: rpoplin Date: Wed, 3 Mar 2010 20:33:35 +0000 Subject: [PATCH] Incremental update to VariantOptimizer. Refactored parts of the clustering code to make it more clear. More comments. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2922 348d0f76-0448-11de-a6fe-93d51630548a --- .../VariantClusteringModel.java | 37 +++ .../variantoptimizer/VariantDataManager.java | 23 +- .../VariantGaussianMixtureModel.java | 290 +++++++++++------- .../VariantNearestNeighborsModel.java | 6 +- .../VariantOptimizationInterface.java | 2 +- .../VariantOptimizationModel.java | 7 +- .../variantoptimizer/VariantOptimizer.java | 87 +++--- .../walkers/variantoptimizer/VariantTree.java | 2 +- .../variantoptimizer/VariantTreeNode.java | 2 +- 9 files changed, 279 insertions(+), 177 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantClusteringModel.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantClusteringModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantClusteringModel.java new file mode 100755 index 000000000..86dd1854f --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantClusteringModel.java @@ -0,0 +1,37 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; + +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Mar 2, 2010 + */ + +public interface VariantClusteringModel extends VariantOptimizationInterface { + public void createClusters( final VariantDatum[] data ); + public double[] applyClusters( final VariantDatum[] data ); +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java index 1e28d2306..4673b8b41 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java @@ -40,35 +40,38 @@ public class VariantDataManager { public final int numAnnotations; public final double[] varianceVector; public boolean isNormalized; + private final ExpandingArrayList annotationKeys; - public VariantDataManager( final ExpandingArrayList dataList ) { + public VariantDataManager( final ExpandingArrayList dataList, final ExpandingArrayList _annotationKeys ) { numVariants = dataList.size(); data = dataList.toArray( new VariantDatum[numVariants]); if( numVariants <= 0 ) { - throw new StingException( "There are zero variants! (or possibly problem with integer overflow)" ); + throw new StingException( "There are zero variants! (or possibly a problem with integer overflow)" ); } numAnnotations = data[0].annotations.length; if( numAnnotations <= 0 ) { - throw new StingException( "There are zero annotations! (or possibly problem with integer overflow)" ); + throw new StingException( "There are zero annotations! (or possibly a problem with integer overflow)" ); } varianceVector = new double[numAnnotations]; isNormalized = false; + annotationKeys = _annotationKeys; } public void normalizeData() { for( int iii = 0; iii < numAnnotations; iii++ ) { final double theMean = mean(data, iii); - final double theVariance = variance(data, theMean, iii); - varianceVector[iii] = theVariance * theVariance; // BUGBUG: model's use this as sigma^2 instead of sigma + final double theSTD = standardDeviation(data, theMean, iii); + System.out.println( (iii == numAnnotations-1 ? "QUAL" : annotationKeys.get(iii)) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) ); + varianceVector[iii] = theSTD * theSTD; for( int jjj=0; jjj null pointer for( int jjj = 0; jjj < numAnnotations; jjj++ ) { @@ -89,117 +116,146 @@ public class VariantGaussianMixtureModel extends VariantOptimizationModel { ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Expectation Step (calculate the probability that each data point is in each cluster) ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// - for( int iii = 0; iii < numVariants; iii++ ) { - double sumProb = 0.0; - for( int kkk = 0; kkk < numGaussians; kkk++ ) { - //pVarInCluster[kkk][iii] = rand.nextDouble(); - double sum = 0.0; - for( int jjj = 0; jjj < numAnnotations; jjj++ ) { - sum += ( (data[iii].annotations[jjj] - mu[kkk][jjj]) * (data[iii].annotations[jjj] - mu[kkk][jjj]) ) - / sigma[kkk][jjj]; - } - pVarInCluster[kkk][iii] = pCluster[kkk] * Math.exp( -0.5 * sum ); - - if( pVarInCluster[kkk][iii] < MIN_PROB) { // Very small numbers are a very big problem - pVarInCluster[kkk][iii] = MIN_PROB; - } - - sumProb += pVarInCluster[kkk][iii]; - } - - if( sumProb > MIN_SUM_PROB ) { // Very small numbers are a very big problem - for( int kkk = 0; kkk < numGaussians; kkk++ ) { - pVarInCluster[kkk][iii] /= sumProb; - } - } - } + evaluateGaussians( data, pVarInCluster ); ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Maximization Step (move the clusters to maximize the sum probability of each data point) ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// - for( int kkk = 0; kkk < numGaussians; kkk++ ) { - for( int jjj = 0; jjj < numAnnotations; jjj++ ) { - mu[kkk][jjj] = 0.0; - sigma[kkk][jjj] = 0.0; - } - } - for( int kkk = 0; kkk < numGaussians; kkk++ ) { - double sumProb = 0.0; - for( int iii = 0; iii < numVariants; iii++ ) { - final double prob = pVarInCluster[kkk][iii]; - sumProb += prob; - for( int jjj = 0; jjj < numAnnotations; jjj++ ) { - mu[kkk][jjj] += prob * data[iii].annotations[jjj]; - } - } - - if( sumProb > MIN_SUM_PROB ) { - for( int jjj = 0; jjj < numAnnotations; jjj++ ) { - mu[kkk][jjj] /= sumProb; - } - } - - for( int iii = 0; iii < numVariants; iii++ ) { - final double prob = pVarInCluster[kkk][iii]; - for( int jjj = 0; jjj < numAnnotations; jjj++ ) { - sigma[kkk][jjj] += prob * (data[iii].annotations[jjj]-mu[kkk][jjj]) * (data[iii].annotations[jjj]-mu[kkk][jjj]); - } - } - - for( int jjj = 0; jjj < numAnnotations; jjj++ ) { - if( sigma[kkk][jjj] < MIN_PROB) { // Very small numbers are a very big problem - sigma[kkk][jjj] = MIN_PROB; - } - } - - if( sumProb > MIN_SUM_PROB ) { - for( int jjj = 0; jjj < numAnnotations; jjj++ ) { - sigma[kkk][jjj] /= sumProb; - } - } - - pCluster[kkk] = sumProb / numVariants; // BUGBUG: Experiment with this, want to keep many clusters alive - } - - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Evaluate the clusters using titv as an estimate of the true positive rate - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// - for( int kkk = 0; kkk < numGaussians; kkk++ ) { - probTi[kkk] = 0.0; - probTv[kkk] = 0.0; - } - for( int iii = 0; iii < numVariants; iii++ ) { - if( data[iii].isTransition ) { // transition - for( int kkk = 0; kkk < numGaussians; kkk++ ) { - probTi[kkk] += pVarInCluster[kkk][iii]; - } - } else { // transversion - for( int kkk = 0; kkk < numGaussians; kkk++ ) { - probTv[kkk] += pVarInCluster[kkk][iii]; - } - } - } - for( int kkk = 0; kkk < numGaussians; kkk++ ) { - clusterTruePositiveRate[kkk] = calcTruePositiveRateFromTITV( probTi[kkk] / probTv[kkk] ); - } - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Evaluate each variant using the probability of being in each cluster and that cluster's true positive rate - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// - for( int iii = 0; iii < numVariants; iii++ ) { - pTrueVariant[ttt][iii] = 0.0; - for( int kkk = 0; kkk < numGaussians; kkk++ ) { - pTrueVariant[ttt][iii] += pVarInCluster[kkk][iii] * clusterTruePositiveRate[kkk]; - } - } - - + maximizeGaussians( data, pVarInCluster ); System.out.println("Finished iteration " + (ttt+1) ); } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Evaluate the clusters using titv as an estimate of the true positive rate + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + evaluateGaussians( data, pVarInCluster ); // One final evaluation because the Gaussians moved in the last maximization step + + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + probTi[kkk] = 0.0; + probTv[kkk] = 0.0; + } + for( int iii = 0; iii < numVariants; iii++ ) { + if( data[iii].isTransition ) { // transition + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + probTi[kkk] += pVarInCluster[kkk][iii]; + } + } else { // transversion + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + probTv[kkk] += pVarInCluster[kkk][iii]; + } + } + } + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + clusterTruePositiveRate[kkk] = calcTruePositiveRateFromTITV( probTi[kkk] / probTv[kkk] ); + } + } + + public final double[] applyClusters( final VariantDatum[] data ) { + + final int numVariants = data.length; + + final double[] pTrueVariant = new double[numVariants]; + final double[][] pVarInCluster = new double[numGaussians][numVariants]; + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Expectation Step (calculate the probability that each data point is in each cluster) + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + evaluateGaussians( data, pVarInCluster ); + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Evaluate each variant using the probability of being in each cluster and that cluster's true positive rate + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + for( int iii = 0; iii < numVariants; iii++ ) { + pTrueVariant[iii] = 0.0; + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + pTrueVariant[iii] += pVarInCluster[kkk][iii] * clusterTruePositiveRate[kkk]; + } + } return pTrueVariant; } + + + private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster ) { + + final int numAnnotations = data[0].annotations.length; + + for( int iii = 0; iii < data.length; iii++ ) { + double sumProb = 0.0; + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + double sum = 0.0; + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + sum += ( (data[iii].annotations[jjj] - mu[kkk][jjj]) * (data[iii].annotations[jjj] - mu[kkk][jjj]) ) + / sigma[kkk][jjj]; + } + pVarInCluster[kkk][iii] = pCluster[kkk] * Math.exp( -0.5 * sum ); + + if( pVarInCluster[kkk][iii] < MIN_PROB) { // Very small numbers are a very big problem + pVarInCluster[kkk][iii] = MIN_PROB; + } + + sumProb += pVarInCluster[kkk][iii]; + } + + if( sumProb > MIN_SUM_PROB ) { // Very small numbers are a very big problem + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + pVarInCluster[kkk][iii] /= sumProb; + } + } + } + } + + + private void maximizeGaussians( final VariantDatum[] data, final double[][] pVarInCluster ) { + + final int numVariants = data.length; + final int numAnnotations = data[0].annotations.length; + + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + mu[kkk][jjj] = 0.0; + sigma[kkk][jjj] = 0.0; + } + } + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + double sumProb = 0.0; + for( int iii = 0; iii < numVariants; iii++ ) { + final double prob = pVarInCluster[kkk][iii]; + sumProb += prob; + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + mu[kkk][jjj] += prob * data[iii].annotations[jjj]; + } + } + + if( sumProb > MIN_SUM_PROB ) { + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + mu[kkk][jjj] /= sumProb; + } + } + + for( int iii = 0; iii < numVariants; iii++ ) { + final double prob = pVarInCluster[kkk][iii]; + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + sigma[kkk][jjj] += prob * (data[iii].annotations[jjj]-mu[kkk][jjj]) * (data[iii].annotations[jjj]-mu[kkk][jjj]); + } + } + + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + if( sigma[kkk][jjj] < MIN_PROB) { // Very small numbers are a very big problem + sigma[kkk][jjj] = MIN_PROB; + } + } + + if( sumProb > MIN_SUM_PROB ) { + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + sigma[kkk][jjj] /= sumProb; + } + } + + pCluster[kkk] = sumProb / numVariants; // BUGBUG: Experiment with this, want to keep many clusters alive + // Perhaps replace the cluster with a new random draw once pCluster gets too small + // and break up a large cluster with examples drawn from that cluster + } + } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantNearestNeighborsModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantNearestNeighborsModel.java index 2ad9c795c..7386f7025 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantNearestNeighborsModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantNearestNeighborsModel.java @@ -37,11 +37,11 @@ public class VariantNearestNeighborsModel extends VariantOptimizationModel { super( _dataManager, _targetTITV ); } - public double[][] run() { + public double[] run() { final int numVariants = dataManager.numVariants; - final double[][] pTrueVariant = new double[1][numVariants]; + final double[] pTrueVariant = new double[numVariants]; final VariantTree vTree = new VariantTree( 2000 ); vTree.createTreeFromData( dataManager.data ); @@ -49,7 +49,7 @@ public class VariantNearestNeighborsModel extends VariantOptimizationModel { System.out.println("Finished creating the kd-tree."); for(int iii = 0; iii < numVariants; iii++) { - pTrueVariant[0][iii] = calcTruePositiveRateFromTITV( vTree.calcNeighborhoodTITV( dataManager.data[iii] ) ); + pTrueVariant[iii] = calcTruePositiveRateFromTITV( vTree.calcNeighborhoodTITV( dataManager.data[iii] ) ); } return pTrueVariant; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationInterface.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationInterface.java index a6b4d7415..33fcb52ca 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationInterface.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationInterface.java @@ -32,5 +32,5 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; */ public interface VariantOptimizationInterface { - public double[][] run(); + public double[] run(); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java index 184d67019..ad904d54d 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java @@ -42,8 +42,9 @@ public abstract class VariantOptimizationModel implements VariantOptimizationInt public double calcTruePositiveRateFromTITV( double titv ) { if( titv > targetTITV ) { titv -= 2.0f*(titv-targetTITV); } - if( titv < 0.5f ) { titv = 0.5f; } - - return ( (titv - 0.5f) / (targetTITV - 0.5f) ); + if( titv < 0.5 ) { titv = 0.5; } + return ( (titv - 0.5) / (targetTITV - 0.5) ); + //if( titv < 0.0 ) { titv = 0.0; } + //return ( titv / targetTITV ); } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java index 359a6792f..5268059a1 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java @@ -51,11 +51,15 @@ public class VariantOptimizer extends RodWalker ///////////////////////////// // Command Line Arguments ///////////////////////////// - @Argument(fullName = "target_titv", shortName="titv", doc="The target Ti/Tv ratio to optimize towards. (~~2.2 for whole genome experiments)", required=true) - public double TARGET_TITV = 2.2; - @Argument(fullName = "filter_output", shortName="filter", doc="If specified the optimizer will not only update the QUAL field of the output VCF file but will also filter the variants", required=false) - public boolean FILTER_OUTPUT = false; - + @Argument(fullName = "target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true) + private double TARGET_TITV = 2.12; + //@Argument(fullName = "filter_output", shortName="filter", doc="If specified the optimizer will not only update the QUAL field of the output VCF file but will also filter the variants", required=false) + //private boolean FILTER_OUTPUT = false; + @Argument(fullName = "ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false) + private boolean IGNORE_INPUT_FILTERS = false; + @Argument(fullName = "exclude_annotation", shortName = "exclude", doc = "The names of the annotations which should be excluded from the calculations", required = false) + private String[] EXCLUDED_ANNOTATIONS = null; + ///////////////////////////// // Private Member Variables ///////////////////////////// @@ -90,46 +94,48 @@ public class VariantOptimizer extends RodWalker double annotationValues[] = new double[numAnnotations]; for( final VariantContext vc : tracker.getAllVariantContexts(null, context.getLocation(), false, false) ) { - if( vc != null && vc.isSNP() ) { + + if( vc != null && vc.isSNP() && (IGNORE_INPUT_FILTERS || !vc.isFiltered()) ) { if( firstVariant ) { // This is the first variant encountered so set up the list of annotations annotationKeys.addAll( vc.getAttributes().keySet() ); - if( annotationKeys.contains("ID") ) { annotationKeys.remove("ID"); } // ID field is added to the vc's INFO field?? + if( annotationKeys.contains("ID") ) { annotationKeys.remove("ID"); } // ID field is added to the vc's INFO field? if( annotationKeys.contains("DB") ) { annotationKeys.remove("DB"); } - if( annotationKeys.contains("Dels") ) { annotationKeys.remove("Dels"); } - if( annotationKeys.contains("AN") ) { annotationKeys.remove("AN"); } + if( EXCLUDED_ANNOTATIONS != null ) { + for( final String excludedAnnotation : EXCLUDED_ANNOTATIONS ) { + if( annotationKeys.contains( excludedAnnotation ) ) { annotationKeys.remove( excludedAnnotation ); } + } + } numAnnotations = annotationKeys.size() + 1; // +1 for variant quality ("QUAL") annotationValues = new double[numAnnotations]; firstVariant = false; } - //BUGBUG: for now only using the novel SNPs - if( vc.getAttribute("ID").equals(".") ) { - int iii = 0; - for( final String key : annotationKeys ) { + int iii = 0; + for( final String key : annotationKeys ) { - double value = 0.0f; - try { - value = Double.parseDouble( (String)vc.getAttribute( key, "0.0" ) ); - } catch( NumberFormatException e ) { - // do nothing, default value is 0.0f, annotations with zero variance will be ignored later - } - annotationValues[iii++] = value; + double value = 0.0; + try { + value = Double.parseDouble( (String)vc.getAttribute( key, "0.0" ) ); + } catch( NumberFormatException e ) { + // do nothing, default value is 0.0, + // BUGBUG: annotations with zero variance should be ignored } - - // Variant quality ("QUAL") is not in the list of annotations - annotationValues[iii] = vc.getPhredScaledQual(); - - VariantDatum variantDatum = new VariantDatum(); - variantDatum.annotations = annotationValues; - variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0; - variantDatum.isKnown = !vc.getAttribute("ID").equals("."); - variantDatum.isFiltered = vc.isFiltered(); - mapList.add( variantDatum ); + annotationValues[iii++] = value; } + + // Variant quality ("QUAL") is not in the list of annotations, but is useful so add it here. + annotationValues[iii] = vc.getPhredScaledQual(); + + VariantDatum variantDatum = new VariantDatum(); + variantDatum.annotations = annotationValues; + variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0; + variantDatum.isKnown = !vc.getAttribute("ID").equals("."); + variantDatum.isFiltered = vc.isFiltered(); // BUGBUG: This field won't be needed in the final version. + mapList.add( variantDatum ); } } - return mapList; // This value isn't actually used for anything + return mapList; } //--------------------------------------------------------------------------------------------------------------- @@ -149,7 +155,7 @@ public class VariantOptimizer extends RodWalker public void onTraversalDone( ExpandingArrayList reduceSum ) { - final VariantDataManager dataManager = new VariantDataManager( reduceSum ); + final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys ); reduceSum.clear(); // Don't need this ever again, clean up some memory logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations."); @@ -158,19 +164,17 @@ public class VariantOptimizer extends RodWalker dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ] final VariantOptimizationModel gmm = new VariantGaussianMixtureModel( dataManager, TARGET_TITV ); - final double[][] p = gmm.run(); - final int numIterations = 16; + final double[] p = gmm.run(); + // BUGBUG: Change to call a second ROD walker to output the new VCF file with new qual fields and filters + // Intermediate cluster ROD can be analyzed to assess clustering performance try { - final PrintStream out = new PrintStream("gmm512x16norm.data"); + final PrintStream out = new PrintStream("gmm128clusterNovel.data"); // Parse in Matlab to create performance plots for(int iii = 0; iii < dataManager.numVariants; iii++) { - for( int ttt = 0; ttt < numIterations; ttt++ ) { - out.print(p[ttt][iii] + "\t"); - } - out.println((dataManager.data[iii].isTransition ? 1 : 0) + out.print(p[iii] + "\t"); + out.println( (dataManager.data[iii].isTransition ? 1 : 0) + "\t" + (dataManager.data[iii].isKnown? 1 : 0) - + "\t" + (dataManager.data[iii].isFiltered ? 1 : 0) - ); + + "\t" + (dataManager.data[iii].isFiltered ? 1 : 0) ); } @@ -178,6 +182,7 @@ public class VariantOptimizer extends RodWalker e.printStackTrace(); System.exit(-1); } + } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantTree.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantTree.java index f86d88af4..3642da9dc 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantTree.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantTree.java @@ -92,7 +92,7 @@ public class VariantTree { final double[] distSquared = new double[data.length]; int iii = 0; for( final VariantDatum variantDatum : data ) { - distSquared[iii] = 0.0f; + distSquared[iii] = 0.0; int jjj = 0; for( final double value : variantDatum.annotations) { final double diff = variant[jjj] - value; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantTreeNode.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantTreeNode.java index 9f1af1b8f..02b4c35c3 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantTreeNode.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantTreeNode.java @@ -39,7 +39,7 @@ public class VariantTreeNode { public double cutValue; public VariantDatum[] variants; - private final int minBinSize = 8000; // BUGBUG: must be bigger than KNN + private final int minBinSize = 8000; // BUGBUG: must be larger than number of kNN public VariantTreeNode() { left = null;