From 3a863d3e8ca8eb1b5eaae7ef50c0dce327f437c5 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Mon, 1 Mar 2010 15:26:18 +0000 Subject: [PATCH] Initial check in of VariantOptimizer in playground. There is a Gaussian Mixture Model version and a k-Nearest Neighbors version. There is still lots of work to do. Nobody should be using it yet. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2904 348d0f76-0448-11de-a6fe-93d51630548a --- .../variantoptimizer/VariantDataManager.java | 63 ++++++ .../variantoptimizer/VariantDatum.java | 14 ++ .../VariantGaussianMixtureModel.java | 180 +++++++++++++++++ .../VariantNearestNeighborsModel.java | 32 +++ .../VariantOptimizationInterface.java | 11 ++ .../VariantOptimizationModel.java | 24 +++ .../variantoptimizer/VariantOptimizer.java | 183 ++++++++++++++++++ .../walkers/variantoptimizer/VariantTree.java | 119 ++++++++++++ .../variantoptimizer/VariantTreeNode.java | 79 ++++++++ 9 files changed, 705 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDatum.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantNearestNeighborsModel.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationInterface.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantTree.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantTreeNode.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java new file mode 100755 index 000000000..dfd3c02c1 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java @@ -0,0 +1,63 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; + +import org.broadinstitute.sting.utils.ExpandingArrayList; +import org.broadinstitute.sting.utils.StingException; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Feb 26, 2010 + */ + +public class VariantDataManager { + public final VariantDatum[] data; + public final int numVariants; + public final int numAnnotations; + public final double[] varianceVector; + public boolean isNormalized; + + public VariantDataManager( final ExpandingArrayList dataList ) { + numVariants = dataList.size(); + data = dataList.toArray( new VariantDatum[numVariants]); + if( numVariants <= 0 ) { + throw new StingException( "There are zero variants! (or possibly problem with integer overflow)" ); + } + numAnnotations = data[0].annotations.length; + if( numAnnotations <= 0 ) { + throw new StingException( "There are zero annotations! (or possibly problem with integer overflow)" ); + } + varianceVector = new double[numAnnotations]; + isNormalized = false; + } + + public void normalizeData() { + for( int iii = 0; iii < numAnnotations; iii++ ) { + final double theMean = mean(data, iii); + final double theVariance = variance(data, theMean, iii); + varianceVector[iii] = theVariance * theVariance; // BUGBUG: model's use this as sigma^2 instead of sigma + for( int jjj=0; jjj null pointer + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + randSigma[jjj] = dataManager.varianceVector[jjj] + ((1.0 + rand.nextDouble()) * 0.01 * dataManager.varianceVector[jjj]); + } + } + sigma[kkk] = randSigma; + } + + for( int ttt = 0; ttt < numIterations; ttt++ ) { + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Expectation Step (calculate the probability that each data point is in each cluster) + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + for( int iii = 0; iii < numVariants; iii++ ) { + double sumProb = 0.0; + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + //pVarInCluster[kkk][iii] = rand.nextDouble(); + double sum = 0.0; + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + sum += ( (data[iii].annotations[jjj] - mu[kkk][jjj]) * (data[iii].annotations[jjj] - mu[kkk][jjj]) ) + / sigma[kkk][jjj]; + } + pVarInCluster[kkk][iii] = pCluster[kkk] * Math.exp( -0.5 * sum ); + + if( pVarInCluster[kkk][iii] < MIN_PROB) { // Very small numbers are a very big problem + pVarInCluster[kkk][iii] = MIN_PROB; + } + + sumProb += pVarInCluster[kkk][iii]; + } + + if( sumProb > MIN_SUM_PROB ) { // Very small numbers are a very big problem + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + pVarInCluster[kkk][iii] /= sumProb; + } + } + } + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Maximization Step (move the clusters to maximize the sum probability of each data point) + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + mu[kkk][jjj] = 0.0; + sigma[kkk][jjj] = 0.0; + } + } + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + double sumProb = 0.0; + for( int iii = 0; iii < numVariants; iii++ ) { + final double prob = pVarInCluster[kkk][iii]; + sumProb += prob; + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + mu[kkk][jjj] += prob * data[iii].annotations[jjj]; + } + } + + if( sumProb > MIN_SUM_PROB ) { + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + mu[kkk][jjj] /= sumProb; + } + } + + for( int iii = 0; iii < numVariants; iii++ ) { + final double prob = pVarInCluster[kkk][iii]; + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + sigma[kkk][jjj] += prob * (data[iii].annotations[jjj]-mu[kkk][jjj]) * (data[iii].annotations[jjj]-mu[kkk][jjj]); + } + } + + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + if( sigma[kkk][jjj] < MIN_PROB) { // Very small numbers are a very big problem + sigma[kkk][jjj] = MIN_PROB; + } + } + + if( sumProb > MIN_SUM_PROB ) { + for( int jjj = 0; jjj < numAnnotations; jjj++ ) { + sigma[kkk][jjj] /= sumProb; + } + } + + pCluster[kkk] = sumProb / numVariants; // BUGBUG: Experiment with this, want to keep many clusters alive + } + + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Evaluate the clusters using titv as an estimate of the true positive rate + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + probTi[kkk] = 0.0; + probTv[kkk] = 0.0; + } + for( int iii = 0; iii < numVariants; iii++ ) { + if( data[iii].isTransition ) { // transition + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + probTi[kkk] += pVarInCluster[kkk][iii]; + } + } else { // transversion + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + probTv[kkk] += pVarInCluster[kkk][iii]; + } + } + } + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + clusterTruePositiveRate[kkk] = calcTruePositiveRateFromTITV( probTi[kkk] / probTv[kkk] ); + } + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Evaluate each variant using the probability of being in each cluster and that cluster's true positive rate + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + for( int iii = 0; iii < numVariants; iii++ ) { + pTrueVariant[ttt][iii] = 0.0; + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + pTrueVariant[ttt][iii] += pVarInCluster[kkk][iii] * clusterTruePositiveRate[kkk]; + } + } + + + + System.out.println("Finished iteration " + (ttt+1) ); + } + + + return pTrueVariant; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantNearestNeighborsModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantNearestNeighborsModel.java new file mode 100755 index 000000000..fdfcf30be --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantNearestNeighborsModel.java @@ -0,0 +1,32 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Mar 1, 2010 + */ + +public class VariantNearestNeighborsModel extends VariantOptimizationModel { + + public VariantNearestNeighborsModel( VariantDataManager _dataManager, final double _targetTITV ) { + super( _dataManager, _targetTITV ); + } + + public double[][] run() { + + final int numVariants = dataManager.numVariants; + + final double[][] pTrueVariant = new double[1][numVariants]; + + final VariantTree vTree = new VariantTree( 2000 ); + vTree.createTreeFromData( dataManager.data ); + + System.out.println("Finished creating the kd-tree."); + + for(int iii = 0; iii < numVariants; iii++) { + pTrueVariant[0][iii] = calcTruePositiveRateFromTITV( vTree.calcNeighborhoodTITV( dataManager.data[iii] ) ); + } + + return pTrueVariant; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationInterface.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationInterface.java new file mode 100755 index 000000000..934889a58 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationInterface.java @@ -0,0 +1,11 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Feb 26, 2010 + */ + +public interface VariantOptimizationInterface { + public double[][] run(); +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java new file mode 100755 index 000000000..82853a2fb --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java @@ -0,0 +1,24 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Feb 26, 2010 + */ + +public abstract class VariantOptimizationModel implements VariantOptimizationInterface { + protected final VariantDataManager dataManager; + protected final double targetTITV; + + public VariantOptimizationModel( VariantDataManager _dataManager, final double _targetTITV ) { + dataManager = _dataManager; + targetTITV = _targetTITV; + } + + public double calcTruePositiveRateFromTITV( double titv ) { + if( titv > targetTITV ) { titv -= 2.0f*(titv-targetTITV); } + if( titv < 0.5f ) { titv = 0.5f; } + + return ( (titv - 0.5f) / (targetTITV - 0.5f) ); + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java new file mode 100755 index 000000000..359a6792f --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java @@ -0,0 +1,183 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.ExpandingArrayList; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import java.io.PrintStream; + +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing a callset that is optimized for + * an expected transition / transversion ratio. + * + * @author rpoplin + * @since Feb 11, 2010 + * + * @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing an optimized callset + */ + +public class VariantOptimizer extends RodWalker, ExpandingArrayList> { + + ///////////////////////////// + // Command Line Arguments + ///////////////////////////// + @Argument(fullName = "target_titv", shortName="titv", doc="The target Ti/Tv ratio to optimize towards. (~~2.2 for whole genome experiments)", required=true) + public double TARGET_TITV = 2.2; + @Argument(fullName = "filter_output", shortName="filter", doc="If specified the optimizer will not only update the QUAL field of the output VCF file but will also filter the variants", required=false) + public boolean FILTER_OUTPUT = false; + + ///////////////////////////// + // Private Member Variables + ///////////////////////////// + private final ExpandingArrayList annotationKeys = new ExpandingArrayList(); + private boolean firstVariant = true; + private int numAnnotations = 0; + + //--------------------------------------------------------------------------------------------------------------- + // + // initialize + // + //--------------------------------------------------------------------------------------------------------------- + + public void initialize() { + + } + + //--------------------------------------------------------------------------------------------------------------- + // + // map + // + //--------------------------------------------------------------------------------------------------------------- + + public ExpandingArrayList map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + + final ExpandingArrayList mapList = new ExpandingArrayList(); + + if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers + return mapList; + } + + double annotationValues[] = new double[numAnnotations]; + + for( final VariantContext vc : tracker.getAllVariantContexts(null, context.getLocation(), false, false) ) { + if( vc != null && vc.isSNP() ) { + if( firstVariant ) { // This is the first variant encountered so set up the list of annotations + annotationKeys.addAll( vc.getAttributes().keySet() ); + if( annotationKeys.contains("ID") ) { annotationKeys.remove("ID"); } // ID field is added to the vc's INFO field?? + if( annotationKeys.contains("DB") ) { annotationKeys.remove("DB"); } + if( annotationKeys.contains("Dels") ) { annotationKeys.remove("Dels"); } + if( annotationKeys.contains("AN") ) { annotationKeys.remove("AN"); } + numAnnotations = annotationKeys.size() + 1; // +1 for variant quality ("QUAL") + annotationValues = new double[numAnnotations]; + firstVariant = false; + } + + //BUGBUG: for now only using the novel SNPs + if( vc.getAttribute("ID").equals(".") ) { + int iii = 0; + for( final String key : annotationKeys ) { + + double value = 0.0f; + try { + value = Double.parseDouble( (String)vc.getAttribute( key, "0.0" ) ); + } catch( NumberFormatException e ) { + // do nothing, default value is 0.0f, annotations with zero variance will be ignored later + } + annotationValues[iii++] = value; + } + + // Variant quality ("QUAL") is not in the list of annotations + annotationValues[iii] = vc.getPhredScaledQual(); + + VariantDatum variantDatum = new VariantDatum(); + variantDatum.annotations = annotationValues; + variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0; + variantDatum.isKnown = !vc.getAttribute("ID").equals("."); + variantDatum.isFiltered = vc.isFiltered(); + mapList.add( variantDatum ); + } + } + } + + return mapList; // This value isn't actually used for anything + } + + //--------------------------------------------------------------------------------------------------------------- + // + // reduce + // + //--------------------------------------------------------------------------------------------------------------- + + public ExpandingArrayList reduceInit() { + return new ExpandingArrayList(); + } + + public ExpandingArrayList reduce( final ExpandingArrayList mapValue, final ExpandingArrayList reduceSum ) { + reduceSum.addAll( mapValue ); + return reduceSum; + } + + public void onTraversalDone( ExpandingArrayList reduceSum ) { + + final VariantDataManager dataManager = new VariantDataManager( reduceSum ); + reduceSum.clear(); // Don't need this ever again, clean up some memory + + logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations."); + logger.info( "The annotations are: " + annotationKeys + " and QUAL." ); + + dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ] + + final VariantOptimizationModel gmm = new VariantGaussianMixtureModel( dataManager, TARGET_TITV ); + final double[][] p = gmm.run(); + final int numIterations = 16; + + try { + final PrintStream out = new PrintStream("gmm512x16norm.data"); + for(int iii = 0; iii < dataManager.numVariants; iii++) { + for( int ttt = 0; ttt < numIterations; ttt++ ) { + out.print(p[ttt][iii] + "\t"); + } + out.println((dataManager.data[iii].isTransition ? 1 : 0) + + "\t" + (dataManager.data[iii].isKnown? 1 : 0) + + "\t" + (dataManager.data[iii].isFiltered ? 1 : 0) + ); + } + + + } catch (Exception e) { + e.printStackTrace(); + System.exit(-1); + } + } + +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantTree.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantTree.java new file mode 100755 index 000000000..952a064e3 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantTree.java @@ -0,0 +1,119 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; + +import org.broadinstitute.sting.utils.StingException; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Feb 24, 2010 + */ + +public class VariantTree { + private final VariantTreeNode root; + private final int numKNN; + + public VariantTree( final int _numKNN ) { + root = new VariantTreeNode(); + numKNN = _numKNN; + } + + public void createTreeFromData( VariantDatum[] data ) { + root.cutData( data, 0, 0, data[0].annotations.length ); + } + + public double calcNeighborhoodTITV( final VariantDatum variant ) { + + double[] distances; + + // Grab the subset of points that are approximately near this point + final VariantDatum[] data = getBin( variant.annotations, root ); + if( data.length < numKNN ) { + throw new StingException( "Bin is too small. Should be > " + numKNN ); + } + + // Find the X nearest points in the subset + final double[] originalDistances = calcDistances( variant.annotations, data ); + distances = originalDistances.clone(); + quickSort( distances, 0, distances.length-1 ); // BUGBUG: distances.length or distances.length-1 + + final double minDistance = distances[numKNN - 1]; + + // Calculate probability of being true based on this set of SNPs + int numTi = 0; + int numTv = 0; + for( int iii = 0; iii < distances.length; iii++ ) { + if( originalDistances[iii] <= minDistance ) { + if( data[iii].isTransition ) { numTi++; } + else { numTv++; } + } + } + + return ((double) numTi) / ((double) numTv); + } + + private VariantDatum[] getBin( final double[] variant, final VariantTreeNode node ) { + if( node.variants != null ) { + return node.variants; + } else { + if( variant[node.cutDim] < node.cutValue ) { + return getBin( variant, node.left ); + } else { + return getBin( variant, node.right ); + } + } + } + + private double[] calcDistances( final double[] variant, final VariantDatum[] data ) { + final double[] distSquared = new double[data.length]; + int iii = 0; + for( final VariantDatum variantDatum : data ) { + distSquared[iii] = 0.0f; + int jjj = 0; + for( final double value : variantDatum.annotations) { + final double diff = variant[jjj] - value; + distSquared[iii] += ( diff * diff ); + jjj++; + } + iii++; + } + + return distSquared; + } + + public static int partition(final double arr[], final int left, final int right) + { + int i = left, j = right; + double tmp; + final double pivot = arr[(left + right) / 2]; + + while (i <= j) { + while (arr[i] < pivot) { + i++; + } + while (arr[j] > pivot) { + j--; + } + if (i <= j) { + tmp = arr[i]; + arr[i] = arr[j]; + arr[j] = tmp; + i++; + j--; + } + } + + return i; + } + + public static void quickSort(final double arr[], final int left, final int right) { + final int index = partition(arr, left, right); + if (left < index - 1) { + quickSort(arr, left, index - 1); + } + if (index < right) { + quickSort(arr, index, right); + } + } + + +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantTreeNode.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantTreeNode.java new file mode 100755 index 000000000..b77505934 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantTreeNode.java @@ -0,0 +1,79 @@ +package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Feb 24, 2010 + */ + +public class VariantTreeNode { + + public VariantTreeNode left; + public VariantTreeNode right; + public int cutDim; + public double cutValue; + public VariantDatum[] variants; + + private final int minBinSize = 8000; // BUGBUG: must be bigger than KNN + + public VariantTreeNode() { + left = null; + right = null; + variants = null; + cutDim = -1; + cutValue = -1; + } + + public void cutData( final VariantDatum[] data, final int depth, final int lastCutDepth, final int numAnnotations ) { + + cutDim = depth % numAnnotations; + + if( depth != lastCutDepth && (cutDim == (lastCutDepth % numAnnotations)) ) { // Base case: we've tried to cut on all the annotations + variants = data; + return; + } + + final double[] values = new double[data.length]; + for( int iii = 0; iii < data.length; iii++ ) { + values[iii] = data[iii].annotations[cutDim]; + } + + final double[] sortedValues = values.clone(); + VariantTree.quickSort( sortedValues, 0, values.length-1 ); // BUGBUG: values.length or values.length-1 + + final int lowPivotIndex = Math.round(0.40f * sortedValues.length); + final int highPivotIndex = Math.round(0.60f * sortedValues.length); + final double lowPivot = sortedValues[lowPivotIndex]; + final double highPivot = sortedValues[highPivotIndex]; + cutValue = highPivot; + + int numLow = 0; + int numHigh = 0; + for( int iii = 0; iii < data.length; iii++ ) { + if( values[iii] < highPivot ) { numLow++; } + if( values[iii] >= lowPivot ) { numHigh++; } + } + + // If cutting here makes the bin too small then don't cut + if( numLow < minBinSize || numHigh < minBinSize || (numLow == numHigh && numLow == data.length) ) { + cutValue = sortedValues[0]; + right = new VariantTreeNode(); + right.cutData(data, depth+1, lastCutDepth, numAnnotations); + } else { + final VariantDatum[] leftData = new VariantDatum[numLow]; + final VariantDatum[] rightData = new VariantDatum[numHigh]; + int leftIndex = 0; + int rightIndex = 0; + for( int iii = 0; iii < data.length; iii++ ) { + if( values[iii] < highPivot ) { leftData[leftIndex++] = data[iii]; } + if( values[iii] >= lowPivot ) { rightData[rightIndex++] = data[iii]; } + } + + left = new VariantTreeNode(); + right = new VariantTreeNode(); + left.cutData(leftData, depth+1, depth, numAnnotations); + right.cutData(rightData, depth+1, depth, numAnnotations); + } + } + +}