Initial check-in of VariantOptimizer in playground. There is a Gaussian Mixture Model version and a k-Nearest Neighbors version. There is still lots of work to do. Nobody should be using it yet.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2904 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6133d73bf0
commit
3a863d3e8c
|
|
@ -0,0 +1,63 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||
|
||||
import org.broadinstitute.sting.utils.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 26, 2010
|
||||
*/
|
||||
|
||||
public class VariantDataManager {
|
||||
public final VariantDatum[] data;
|
||||
public final int numVariants;
|
||||
public final int numAnnotations;
|
||||
public final double[] varianceVector;
|
||||
public boolean isNormalized;
|
||||
|
||||
public VariantDataManager( final ExpandingArrayList<VariantDatum> dataList ) {
|
||||
numVariants = dataList.size();
|
||||
data = dataList.toArray( new VariantDatum[numVariants]);
|
||||
if( numVariants <= 0 ) {
|
||||
throw new StingException( "There are zero variants! (or possibly problem with integer overflow)" );
|
||||
}
|
||||
numAnnotations = data[0].annotations.length;
|
||||
if( numAnnotations <= 0 ) {
|
||||
throw new StingException( "There are zero annotations! (or possibly problem with integer overflow)" );
|
||||
}
|
||||
varianceVector = new double[numAnnotations];
|
||||
isNormalized = false;
|
||||
}
|
||||
|
||||
public void normalizeData() {
|
||||
for( int iii = 0; iii < numAnnotations; iii++ ) {
|
||||
final double theMean = mean(data, iii);
|
||||
final double theVariance = variance(data, theMean, iii);
|
||||
varianceVector[iii] = theVariance * theVariance; // BUGBUG: model's use this as sigma^2 instead of sigma
|
||||
for( int jjj=0; jjj<numVariants; jjj++ ) {
|
||||
data[jjj].annotations[iii] = ( data[jjj].annotations[iii] - theMean ) / theVariance;
|
||||
}
|
||||
}
|
||||
isNormalized = true;
|
||||
}
|
||||
|
||||
private static double mean( final VariantDatum[] data, final int index ) {
|
||||
double sum = 0.0f;
|
||||
final int numVars = data.length;
|
||||
for( int iii = 0; iii < numVars; iii++ ) {
|
||||
sum += (data[iii].annotations[index] / ((double) numVars));
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
private static double variance( final VariantDatum[] data, final double mean, final int index ) {
|
||||
double sum = 0.0f;
|
||||
final int numVars = data.length;
|
||||
for( int iii = 0; iii < numVars; iii++ ) {
|
||||
sum += ( ((data[iii].annotations[index] - mean)*(data[iii].annotations[index] - mean)) / ((double) numVars));
|
||||
}
|
||||
return Math.sqrt( sum );
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 24, 2010
|
||||
*/
|
||||
|
||||
/**
 * Plain data holder for a single variant call: its annotation value vector plus the
 * flags used when evaluating and reporting the optimization models.
 */
public class VariantDatum {
    public double[] annotations;  // one value per annotation dimension; QUAL is appended as the last entry by the walker
    public boolean isTransition;  // true for a transition SNP, false for a transversion
    public boolean isKnown;       // true if the variant carried a non-"." ID attribute
    public boolean isFiltered;    // true if the variant was marked filtered in the input
}
|
||||
|
|
@ -0,0 +1,180 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 26, 2010
|
||||
*/
|
||||
|
||||
public class VariantGaussianMixtureModel extends VariantOptimizationModel {
|
||||
|
||||
private final int numGaussians = 512;
|
||||
private final long RANDOM_SEED = 91801305;
|
||||
private final double MIN_PROB = 1E-30;
|
||||
private final double MIN_SUM_PROB = 1E-20;
|
||||
|
||||
public VariantGaussianMixtureModel( VariantDataManager _dataManager, final double _targetTITV ) {
|
||||
super( _dataManager, _targetTITV );
|
||||
}
|
||||
|
||||
public double[][] run() {
|
||||
|
||||
final int numVariants = dataManager.numVariants;
|
||||
final int numAnnotations = dataManager.numAnnotations;
|
||||
final VariantDatum[] data = dataManager.data;
|
||||
final int numIterations = 16;
|
||||
|
||||
final double[][] pTrueVariant = new double[numIterations][numVariants];
|
||||
final double[] pCluster = new double[numGaussians];
|
||||
final double[] probTi = new double[numGaussians];
|
||||
final double[] probTv = new double[numGaussians];
|
||||
final double[] clusterTruePositiveRate = new double[numGaussians];
|
||||
final double[][] pVarInCluster = new double[numGaussians][numVariants];
|
||||
final double[][] mu = new double[numGaussians][];
|
||||
final double[][] sigma = new double[numGaussians][];
|
||||
final Random rand = new Random( RANDOM_SEED );
|
||||
|
||||
|
||||
// loop control variables:
|
||||
// iii - loop over data points
|
||||
// jjj - loop over annotations (features)
|
||||
// kkk - loop over clusters
|
||||
// ttt - loop over EM iterations
|
||||
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
pCluster[kkk] = 1.0;
|
||||
mu[kkk] = data[rand.nextInt(numVariants)].annotations;
|
||||
final double[] randSigma = new double[numAnnotations];
|
||||
if( dataManager.isNormalized ) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
randSigma[jjj] = 0.9 + 0.2 * rand.nextDouble();
|
||||
}
|
||||
} else { // BUGBUG: if not normalized then the varianceVector hasn't been calculated --> null pointer
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
randSigma[jjj] = dataManager.varianceVector[jjj] + ((1.0 + rand.nextDouble()) * 0.01 * dataManager.varianceVector[jjj]);
|
||||
}
|
||||
}
|
||||
sigma[kkk] = randSigma;
|
||||
}
|
||||
|
||||
for( int ttt = 0; ttt < numIterations; ttt++ ) {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Expectation Step (calculate the probability that each data point is in each cluster)
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
double sumProb = 0.0;
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
//pVarInCluster[kkk][iii] = rand.nextDouble();
|
||||
double sum = 0.0;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
sum += ( (data[iii].annotations[jjj] - mu[kkk][jjj]) * (data[iii].annotations[jjj] - mu[kkk][jjj]) )
|
||||
/ sigma[kkk][jjj];
|
||||
}
|
||||
pVarInCluster[kkk][iii] = pCluster[kkk] * Math.exp( -0.5 * sum );
|
||||
|
||||
if( pVarInCluster[kkk][iii] < MIN_PROB) { // Very small numbers are a very big problem
|
||||
pVarInCluster[kkk][iii] = MIN_PROB;
|
||||
}
|
||||
|
||||
sumProb += pVarInCluster[kkk][iii];
|
||||
}
|
||||
|
||||
if( sumProb > MIN_SUM_PROB ) { // Very small numbers are a very big problem
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
pVarInCluster[kkk][iii] /= sumProb;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Maximization Step (move the clusters to maximize the sum probability of each data point)
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] = 0.0;
|
||||
sigma[kkk][jjj] = 0.0;
|
||||
}
|
||||
}
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
double sumProb = 0.0;
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
final double prob = pVarInCluster[kkk][iii];
|
||||
sumProb += prob;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] += prob * data[iii].annotations[jjj];
|
||||
}
|
||||
}
|
||||
|
||||
if( sumProb > MIN_SUM_PROB ) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] /= sumProb;
|
||||
}
|
||||
}
|
||||
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
final double prob = pVarInCluster[kkk][iii];
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
sigma[kkk][jjj] += prob * (data[iii].annotations[jjj]-mu[kkk][jjj]) * (data[iii].annotations[jjj]-mu[kkk][jjj]);
|
||||
}
|
||||
}
|
||||
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
if( sigma[kkk][jjj] < MIN_PROB) { // Very small numbers are a very big problem
|
||||
sigma[kkk][jjj] = MIN_PROB;
|
||||
}
|
||||
}
|
||||
|
||||
if( sumProb > MIN_SUM_PROB ) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
sigma[kkk][jjj] /= sumProb;
|
||||
}
|
||||
}
|
||||
|
||||
pCluster[kkk] = sumProb / numVariants; // BUGBUG: Experiment with this, want to keep many clusters alive
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Evaluate the clusters using titv as an estimate of the true positive rate
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
probTi[kkk] = 0.0;
|
||||
probTv[kkk] = 0.0;
|
||||
}
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
if( data[iii].isTransition ) { // transition
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
probTi[kkk] += pVarInCluster[kkk][iii];
|
||||
}
|
||||
} else { // transversion
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
probTv[kkk] += pVarInCluster[kkk][iii];
|
||||
}
|
||||
}
|
||||
}
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
clusterTruePositiveRate[kkk] = calcTruePositiveRateFromTITV( probTi[kkk] / probTv[kkk] );
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Evaluate each variant using the probability of being in each cluster and that cluster's true positive rate
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
pTrueVariant[ttt][iii] = 0.0;
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
pTrueVariant[ttt][iii] += pVarInCluster[kkk][iii] * clusterTruePositiveRate[kkk];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
System.out.println("Finished iteration " + (ttt+1) );
|
||||
}
|
||||
|
||||
|
||||
return pTrueVariant;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Mar 1, 2010
|
||||
*/
|
||||
|
||||
public class VariantNearestNeighborsModel extends VariantOptimizationModel {
|
||||
|
||||
public VariantNearestNeighborsModel( VariantDataManager _dataManager, final double _targetTITV ) {
|
||||
super( _dataManager, _targetTITV );
|
||||
}
|
||||
|
||||
public double[][] run() {
|
||||
|
||||
final int numVariants = dataManager.numVariants;
|
||||
|
||||
final double[][] pTrueVariant = new double[1][numVariants];
|
||||
|
||||
final VariantTree vTree = new VariantTree( 2000 );
|
||||
vTree.createTreeFromData( dataManager.data );
|
||||
|
||||
System.out.println("Finished creating the kd-tree.");
|
||||
|
||||
for(int iii = 0; iii < numVariants; iii++) {
|
||||
pTrueVariant[0][iii] = calcTruePositiveRateFromTITV( vTree.calcNeighborhoodTITV( dataManager.data[iii] ) );
|
||||
}
|
||||
|
||||
return pTrueVariant;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 26, 2010
|
||||
*/
|
||||
|
||||
/**
 * Common entry point implemented by the variant optimization models
 * (Gaussian mixture model and k-nearest-neighbors versions).
 */
public interface VariantOptimizationInterface {
    /**
     * Scores every variant in the data set.
     *
     * @return a 2-D array of per-variant scores; the meaning of the first dimension
     *         (e.g. one row per EM iteration) is implementation-defined
     */
    public double[][] run();
}
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 26, 2010
|
||||
*/
|
||||
|
||||
public abstract class VariantOptimizationModel implements VariantOptimizationInterface {
|
||||
protected final VariantDataManager dataManager;
|
||||
protected final double targetTITV;
|
||||
|
||||
public VariantOptimizationModel( VariantDataManager _dataManager, final double _targetTITV ) {
|
||||
dataManager = _dataManager;
|
||||
targetTITV = _targetTITV;
|
||||
}
|
||||
|
||||
public double calcTruePositiveRateFromTITV( double titv ) {
|
||||
if( titv > targetTITV ) { titv -= 2.0f*(titv-targetTITV); }
|
||||
if( titv < 0.5f ) { titv = 0.5f; }
|
||||
|
||||
return ( (titv - 0.5f) / (targetTITV - 0.5f) );
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,183 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing a callset that is optimized for
|
||||
* an expected transition / transversion ratio.
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Feb 11, 2010
|
||||
*
|
||||
* @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing an optimized callset
|
||||
*/
|
||||
|
||||
public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName = "target_titv", shortName="titv", doc="The target Ti/Tv ratio to optimize towards. (~~2.2 for whole genome experiments)", required=true)
|
||||
public double TARGET_TITV = 2.2;
|
||||
@Argument(fullName = "filter_output", shortName="filter", doc="If specified the optimizer will not only update the QUAL field of the output VCF file but will also filter the variants", required=false)
|
||||
public boolean FILTER_OUTPUT = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private final ExpandingArrayList<String> annotationKeys = new ExpandingArrayList<String>();
|
||||
private boolean firstVariant = true;
|
||||
private int numAnnotations = 0;
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
|
||||
|
||||
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
|
||||
return mapList;
|
||||
}
|
||||
|
||||
double annotationValues[] = new double[numAnnotations];
|
||||
|
||||
for( final VariantContext vc : tracker.getAllVariantContexts(null, context.getLocation(), false, false) ) {
|
||||
if( vc != null && vc.isSNP() ) {
|
||||
if( firstVariant ) { // This is the first variant encountered so set up the list of annotations
|
||||
annotationKeys.addAll( vc.getAttributes().keySet() );
|
||||
if( annotationKeys.contains("ID") ) { annotationKeys.remove("ID"); } // ID field is added to the vc's INFO field??
|
||||
if( annotationKeys.contains("DB") ) { annotationKeys.remove("DB"); }
|
||||
if( annotationKeys.contains("Dels") ) { annotationKeys.remove("Dels"); }
|
||||
if( annotationKeys.contains("AN") ) { annotationKeys.remove("AN"); }
|
||||
numAnnotations = annotationKeys.size() + 1; // +1 for variant quality ("QUAL")
|
||||
annotationValues = new double[numAnnotations];
|
||||
firstVariant = false;
|
||||
}
|
||||
|
||||
//BUGBUG: for now only using the novel SNPs
|
||||
if( vc.getAttribute("ID").equals(".") ) {
|
||||
int iii = 0;
|
||||
for( final String key : annotationKeys ) {
|
||||
|
||||
double value = 0.0f;
|
||||
try {
|
||||
value = Double.parseDouble( (String)vc.getAttribute( key, "0.0" ) );
|
||||
} catch( NumberFormatException e ) {
|
||||
// do nothing, default value is 0.0f, annotations with zero variance will be ignored later
|
||||
}
|
||||
annotationValues[iii++] = value;
|
||||
}
|
||||
|
||||
// Variant quality ("QUAL") is not in the list of annotations
|
||||
annotationValues[iii] = vc.getPhredScaledQual();
|
||||
|
||||
VariantDatum variantDatum = new VariantDatum();
|
||||
variantDatum.annotations = annotationValues;
|
||||
variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
|
||||
variantDatum.isKnown = !vc.getAttribute("ID").equals(".");
|
||||
variantDatum.isFiltered = vc.isFiltered();
|
||||
mapList.add( variantDatum );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return mapList; // This value isn't actually used for anything
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduceInit() {
|
||||
return new ExpandingArrayList<VariantDatum>();
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
reduceSum.addAll( mapValue );
|
||||
return reduceSum;
|
||||
}
|
||||
|
||||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
|
||||
final VariantDataManager dataManager = new VariantDataManager( reduceSum );
|
||||
reduceSum.clear(); // Don't need this ever again, clean up some memory
|
||||
|
||||
logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations.");
|
||||
logger.info( "The annotations are: " + annotationKeys + " and QUAL." );
|
||||
|
||||
dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]
|
||||
|
||||
final VariantOptimizationModel gmm = new VariantGaussianMixtureModel( dataManager, TARGET_TITV );
|
||||
final double[][] p = gmm.run();
|
||||
final int numIterations = 16;
|
||||
|
||||
try {
|
||||
final PrintStream out = new PrintStream("gmm512x16norm.data");
|
||||
for(int iii = 0; iii < dataManager.numVariants; iii++) {
|
||||
for( int ttt = 0; ttt < numIterations; ttt++ ) {
|
||||
out.print(p[ttt][iii] + "\t");
|
||||
}
|
||||
out.println((dataManager.data[iii].isTransition ? 1 : 0)
|
||||
+ "\t" + (dataManager.data[iii].isKnown? 1 : 0)
|
||||
+ "\t" + (dataManager.data[iii].isFiltered ? 1 : 0)
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
System.exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,119 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 24, 2010
|
||||
*/
|
||||
|
||||
/**
 * Approximate k-nearest-neighbors support structure: a kd-tree-style partition of the
 * variant data into leaf bins, plus the neighborhood Ti/Tv calculation over a bin.
 */
public class VariantTree {
    private final VariantTreeNode root; // root of the spatial partition
    private final int numKNN;           // number of nearest neighbors to consider

    public VariantTree( final int _numKNN ) {
        root = new VariantTreeNode();
        numKNN = _numKNN;
    }

    // Recursively partitions the data; dimensionality is taken from the first data point.
    public void createTreeFromData( VariantDatum[] data ) {
        root.cutData( data, 0, 0, data[0].annotations.length );
    }

    /**
     * Estimates the transition/transversion ratio of the numKNN points nearest to the
     * given variant, restricted to the leaf bin the variant falls into.
     *
     * @param variant the query variant
     * @return numTi / numTv over the neighborhood.
     *         NOTE(review): returns Infinity when the neighborhood contains no
     *         transversions -- confirm this is acceptable to callers.
     * @throws StingException if the leaf bin holds fewer than numKNN points
     */
    public double calcNeighborhoodTITV( final VariantDatum variant ) {

        double[] distances;

        // Grab the subset of points that are approximately near this point
        final VariantDatum[] data = getBin( variant.annotations, root );
        if( data.length < numKNN ) {
            throw new StingException( "Bin is too small. Should be > " + numKNN );
        }

        // Find the X nearest points in the subset
        final double[] originalDistances = calcDistances( variant.annotations, data );
        distances = originalDistances.clone();
        quickSort( distances, 0, distances.length-1 ); // quickSort's right bound is inclusive, so length-1 is correct

        // Distance of the numKNN-th closest point: the neighborhood radius.
        final double minDistance = distances[numKNN - 1];

        // Calculate probability of being true based on this set of SNPs
        // (ties exactly at the radius mean slightly more than numKNN points may be counted).
        int numTi = 0;
        int numTv = 0;
        for( int iii = 0; iii < distances.length; iii++ ) {
            if( originalDistances[iii] <= minDistance ) {
                if( data[iii].isTransition ) { numTi++; }
                else { numTv++; }
            }
        }

        return ((double) numTi) / ((double) numTv);
    }

    // Walks the tree to the leaf bin containing the given annotation vector
    // (left when the value at the node's cut dimension is below cutValue, else right).
    private VariantDatum[] getBin( final double[] variant, final VariantTreeNode node ) {
        if( node.variants != null ) {
            return node.variants;
        } else {
            if( variant[node.cutDim] < node.cutValue ) {
                return getBin( variant, node.left );
            } else {
                return getBin( variant, node.right );
            }
        }
    }

    // Squared Euclidean distance from the query vector to every point in the bin.
    // No sqrt is taken -- only the relative ordering of distances matters.
    private double[] calcDistances( final double[] variant, final VariantDatum[] data ) {
        final double[] distSquared = new double[data.length];
        int iii = 0;
        for( final VariantDatum variantDatum : data ) {
            distSquared[iii] = 0.0f;
            int jjj = 0;
            for( final double value : variantDatum.annotations) {
                final double diff = variant[jjj] - value;
                distSquared[iii] += ( diff * diff );
                jjj++;
            }
            iii++;
        }

        return distSquared;
    }

    // Bidirectional swap partition over the inclusive range [left, right] around the
    // middle element; returns the first index of the right-hand partition.
    public static int partition(final double arr[], final int left, final int right)
    {
        int i = left, j = right;
        double tmp;
        final double pivot = arr[(left + right) / 2];

        while (i <= j) {
            while (arr[i] < pivot) {
                i++;
            }
            while (arr[j] > pivot) {
                j--;
            }
            if (i <= j) {
                tmp = arr[i];
                arr[i] = arr[j];
                arr[j] = tmp;
                i++;
                j--;
            }
        }

        return i;
    }

    // In-place quicksort over the INCLUSIVE index range [left, right].
    public static void quickSort(final double arr[], final int left, final int right) {
        final int index = partition(arr, left, right);
        if (left < index - 1) {
            quickSort(arr, left, index - 1);
        }
        if (index < right) {
            quickSort(arr, index, right);
        }
    }


}
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 24, 2010
|
||||
*/
|
||||
|
||||
/**
 * One node in the VariantTree partition. Internal nodes hold a cut dimension and cut
 * value; leaf nodes hold the (possibly overlapping) subset of variants in their bin.
 */
public class VariantTreeNode {

    public VariantTreeNode left;    // subtree with values below cutValue (null at a leaf)
    public VariantTreeNode right;   // subtree with values at or above cutValue (null at a leaf)
    public int cutDim;              // annotation dimension this node splits on
    public double cutValue;         // split threshold in dimension cutDim
    public VariantDatum[] variants; // non-null only at a leaf: the bin's contents

    // Minimum number of points allowed on each side of a cut.
    private final int minBinSize = 8000; // BUGBUG: must be bigger than KNN

    public VariantTreeNode() {
        left = null;
        right = null;
        variants = null;
        cutDim = -1;
        cutValue = -1;
    }

    /**
     * Recursively partitions the data, cycling through the annotation dimensions.
     *
     * @param data           the points to partition at this node
     * @param depth          current recursion depth; selects the cut dimension
     * @param lastCutDepth   depth at which the last successful cut happened
     * @param numAnnotations number of annotation dimensions per data point
     */
    public void cutData( final VariantDatum[] data, final int depth, final int lastCutDepth, final int numAnnotations ) {

        cutDim = depth % numAnnotations;

        if( depth != lastCutDepth && (cutDim == (lastCutDepth % numAnnotations)) ) { // Base case: we've tried to cut on all the annotations
            variants = data;
            return;
        }

        // Pull out this dimension's value for every point.
        final double[] values = new double[data.length];
        for( int iii = 0; iii < data.length; iii++ ) {
            values[iii] = data[iii].annotations[cutDim];
        }

        final double[] sortedValues = values.clone();
        VariantTree.quickSort( sortedValues, 0, values.length-1 ); // quickSort's right bound is inclusive, so length-1 is correct

        // The two children deliberately OVERLAP: points between the 40th and 60th
        // percentile are sent to BOTH sides, so neighborhood queries landing near a
        // cut still see the points just across it.
        final int lowPivotIndex = Math.round(0.40f * sortedValues.length);
        final int highPivotIndex = Math.round(0.60f * sortedValues.length);
        final double lowPivot = sortedValues[lowPivotIndex];
        final double highPivot = sortedValues[highPivotIndex];
        cutValue = highPivot;

        int numLow = 0;
        int numHigh = 0;
        for( int iii = 0; iii < data.length; iii++ ) {
            if( values[iii] < highPivot ) { numLow++; }  // goes to the left child
            if( values[iii] >= lowPivot ) { numHigh++; } // goes to the right child (overlap possible)
        }

        // If cutting here makes the bin too small then don't cut
        if( numLow < minBinSize || numHigh < minBinSize || (numLow == numHigh && numLow == data.length) ) {
            // Skip this dimension: keep all the data in the right child and try the next
            // dimension. cutValue is set to the minimum so in-sample lookups descend right.
            // NOTE(review): a query value strictly below this minimum would descend into the
            // null left child and NPE -- confirm out-of-sample queries cannot occur.
            cutValue = sortedValues[0];
            right = new VariantTreeNode();
            right.cutData(data, depth+1, lastCutDepth, numAnnotations);
        } else {
            final VariantDatum[] leftData = new VariantDatum[numLow];
            final VariantDatum[] rightData = new VariantDatum[numHigh];
            int leftIndex = 0;
            int rightIndex = 0;
            for( int iii = 0; iii < data.length; iii++ ) {
                if( values[iii] < highPivot ) { leftData[leftIndex++] = data[iii]; }
                if( values[iii] >= lowPivot ) { rightData[rightIndex++] = data[iii]; }
            }

            left = new VariantTreeNode();
            right = new VariantTreeNode();
            left.cutData(leftData, depth+1, depth, numAnnotations);
            right.cutData(rightData, depth+1, depth, numAnnotations);
        }
    }

}
|
||||
Loading…
Reference in New Issue