Incremental update to VariantOptimizer. Refactored parts of the clustering code to make it more clear. More comments.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2922 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
073fdd8ec7
commit
b241e0915b
|
|
@ -0,0 +1,37 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Mar 2, 2010
|
||||
*/
|
||||
|
||||
public interface VariantClusteringModel extends VariantOptimizationInterface {
|
||||
public void createClusters( final VariantDatum[] data );
|
||||
public double[] applyClusters( final VariantDatum[] data );
|
||||
}
|
||||
|
|
@ -40,35 +40,38 @@ public class VariantDataManager {
|
|||
public final int numAnnotations;
|
||||
public final double[] varianceVector;
|
||||
public boolean isNormalized;
|
||||
private final ExpandingArrayList<String> annotationKeys;
|
||||
|
||||
public VariantDataManager( final ExpandingArrayList<VariantDatum> dataList ) {
|
||||
public VariantDataManager( final ExpandingArrayList<VariantDatum> dataList, final ExpandingArrayList<String> _annotationKeys ) {
|
||||
numVariants = dataList.size();
|
||||
data = dataList.toArray( new VariantDatum[numVariants]);
|
||||
if( numVariants <= 0 ) {
|
||||
throw new StingException( "There are zero variants! (or possibly problem with integer overflow)" );
|
||||
throw new StingException( "There are zero variants! (or possibly a problem with integer overflow)" );
|
||||
}
|
||||
numAnnotations = data[0].annotations.length;
|
||||
if( numAnnotations <= 0 ) {
|
||||
throw new StingException( "There are zero annotations! (or possibly problem with integer overflow)" );
|
||||
throw new StingException( "There are zero annotations! (or possibly a problem with integer overflow)" );
|
||||
}
|
||||
varianceVector = new double[numAnnotations];
|
||||
isNormalized = false;
|
||||
annotationKeys = _annotationKeys;
|
||||
}
|
||||
|
||||
public void normalizeData() {
|
||||
for( int iii = 0; iii < numAnnotations; iii++ ) {
|
||||
final double theMean = mean(data, iii);
|
||||
final double theVariance = variance(data, theMean, iii);
|
||||
varianceVector[iii] = theVariance * theVariance; // BUGBUG: model's use this as sigma^2 instead of sigma
|
||||
final double theSTD = standardDeviation(data, theMean, iii);
|
||||
System.out.println( (iii == numAnnotations-1 ? "QUAL" : annotationKeys.get(iii)) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) );
|
||||
varianceVector[iii] = theSTD * theSTD;
|
||||
for( int jjj=0; jjj<numVariants; jjj++ ) {
|
||||
data[jjj].annotations[iii] = ( data[jjj].annotations[iii] - theMean ) / theVariance;
|
||||
data[jjj].annotations[iii] = ( data[jjj].annotations[iii] - theMean ) / theSTD;
|
||||
}
|
||||
}
|
||||
isNormalized = true;
|
||||
isNormalized = true; // Each data point is now [ (x - mean) / standard deviation ]
|
||||
}
|
||||
|
||||
private static double mean( final VariantDatum[] data, final int index ) {
|
||||
double sum = 0.0f;
|
||||
double sum = 0.0;
|
||||
final int numVars = data.length;
|
||||
for( int iii = 0; iii < numVars; iii++ ) {
|
||||
sum += (data[iii].annotations[index] / ((double) numVars));
|
||||
|
|
@ -76,8 +79,8 @@ public class VariantDataManager {
|
|||
return sum;
|
||||
}
|
||||
|
||||
private static double variance( final VariantDatum[] data, final double mean, final int index ) {
|
||||
double sum = 0.0f;
|
||||
private static double standardDeviation( final VariantDatum[] data, final double mean, final int index ) {
|
||||
double sum = 0.0;
|
||||
final int numVars = data.length;
|
||||
for( int iii = 0; iii < numVars; iii++ ) {
|
||||
sum += ( ((data[iii].annotations[index] - mean)*(data[iii].annotations[index] - mean)) / ((double) numVars));
|
||||
|
|
|
|||
|
|
@ -33,32 +33,54 @@ import java.util.Random;
|
|||
* Date: Feb 26, 2010
|
||||
*/
|
||||
|
||||
public class VariantGaussianMixtureModel extends VariantOptimizationModel {
|
||||
public class VariantGaussianMixtureModel extends VariantOptimizationModel implements VariantClusteringModel {
|
||||
|
||||
private final int numGaussians = 512;
|
||||
private final int numGaussians = 128;
|
||||
private final long RANDOM_SEED = 91801305;
|
||||
private final double MIN_PROB = 1E-30;
|
||||
private final double MIN_SUM_PROB = 1E-20;
|
||||
|
||||
private final double[][] mu = new double[numGaussians][];
|
||||
private final double[][] sigma = new double[numGaussians][];
|
||||
private final double[] pCluster = new double[numGaussians];
|
||||
final double[] clusterTruePositiveRate = new double[numGaussians];
|
||||
|
||||
public VariantGaussianMixtureModel( VariantDataManager _dataManager, final double _targetTITV ) {
|
||||
super( _dataManager, _targetTITV );
|
||||
}
|
||||
|
||||
public double[][] run() {
|
||||
public double[] run() {
|
||||
|
||||
final int numVariants = dataManager.numVariants;
|
||||
final int numAnnotations = dataManager.numAnnotations;
|
||||
final VariantDatum[] data = dataManager.data;
|
||||
final int numIterations = 16;
|
||||
// Create the subset of the data to cluster with
|
||||
int numSubset = 0;
|
||||
for( final VariantDatum datum : dataManager.data ) {
|
||||
if( !datum.isKnown ) {
|
||||
numSubset++;
|
||||
}
|
||||
}
|
||||
final VariantDatum[] data = new VariantDatum[numSubset];
|
||||
int iii = 0;
|
||||
for( final VariantDatum datum : dataManager.data ) {
|
||||
if( !datum.isKnown ) {
|
||||
data[iii++] = datum;
|
||||
}
|
||||
}
|
||||
|
||||
final double[][] pTrueVariant = new double[numIterations][numVariants];
|
||||
final double[] pCluster = new double[numGaussians];
|
||||
System.out.println("Clustering with " + numSubset + " variants...");
|
||||
createClusters( data ); // Using a subset of the data
|
||||
System.out.println("Applying clusters to all variants...");
|
||||
return applyClusters( dataManager.data ); // Using all the data
|
||||
}
|
||||
|
||||
public final void createClusters( final VariantDatum[] data ) {
|
||||
|
||||
final int numVariants = data.length;
|
||||
final int numAnnotations = data[0].annotations.length;
|
||||
final int numIterations = 3;
|
||||
|
||||
final double[][] pVarInCluster = new double[numGaussians][numVariants];
|
||||
final double[] probTi = new double[numGaussians];
|
||||
final double[] probTv = new double[numGaussians];
|
||||
final double[] clusterTruePositiveRate = new double[numGaussians];
|
||||
final double[][] pVarInCluster = new double[numGaussians][numVariants];
|
||||
final double[][] mu = new double[numGaussians][];
|
||||
final double[][] sigma = new double[numGaussians][];
|
||||
final Random rand = new Random( RANDOM_SEED );
|
||||
|
||||
|
||||
|
|
@ -68,13 +90,18 @@ public class VariantGaussianMixtureModel extends VariantOptimizationModel {
|
|||
// kkk - loop over clusters
|
||||
// ttt - loop over EM iterations
|
||||
|
||||
// Set up the initial random Gaussians
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
pCluster[kkk] = 1.0;
|
||||
mu[kkk] = data[rand.nextInt(numVariants)].annotations;
|
||||
//final double[] randMu = new double[numAnnotations];
|
||||
//for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
// randMu[jjj] = data[rand.nextInt(numVariants)].annotations[jjj];
|
||||
//}
|
||||
mu[kkk] = data[rand.nextInt(numVariants)].annotations; //randMu;
|
||||
final double[] randSigma = new double[numAnnotations];
|
||||
if( dataManager.isNormalized ) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
randSigma[jjj] = 0.9 + 0.2 * rand.nextDouble();
|
||||
randSigma[jjj] = 0.9 + 0.2 * rand.nextDouble();
|
||||
}
|
||||
} else { // BUGBUG: if not normalized then the varianceVector hasn't been calculated --> null pointer
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
|
|
@ -89,117 +116,146 @@ public class VariantGaussianMixtureModel extends VariantOptimizationModel {
|
|||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Expectation Step (calculate the probability that each data point is in each cluster)
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
double sumProb = 0.0;
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
//pVarInCluster[kkk][iii] = rand.nextDouble();
|
||||
double sum = 0.0;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
sum += ( (data[iii].annotations[jjj] - mu[kkk][jjj]) * (data[iii].annotations[jjj] - mu[kkk][jjj]) )
|
||||
/ sigma[kkk][jjj];
|
||||
}
|
||||
pVarInCluster[kkk][iii] = pCluster[kkk] * Math.exp( -0.5 * sum );
|
||||
|
||||
if( pVarInCluster[kkk][iii] < MIN_PROB) { // Very small numbers are a very big problem
|
||||
pVarInCluster[kkk][iii] = MIN_PROB;
|
||||
}
|
||||
|
||||
sumProb += pVarInCluster[kkk][iii];
|
||||
}
|
||||
|
||||
if( sumProb > MIN_SUM_PROB ) { // Very small numbers are a very big problem
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
pVarInCluster[kkk][iii] /= sumProb;
|
||||
}
|
||||
}
|
||||
}
|
||||
evaluateGaussians( data, pVarInCluster );
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Maximization Step (move the clusters to maximize the sum probability of each data point)
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] = 0.0;
|
||||
sigma[kkk][jjj] = 0.0;
|
||||
}
|
||||
}
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
double sumProb = 0.0;
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
final double prob = pVarInCluster[kkk][iii];
|
||||
sumProb += prob;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] += prob * data[iii].annotations[jjj];
|
||||
}
|
||||
}
|
||||
|
||||
if( sumProb > MIN_SUM_PROB ) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] /= sumProb;
|
||||
}
|
||||
}
|
||||
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
final double prob = pVarInCluster[kkk][iii];
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
sigma[kkk][jjj] += prob * (data[iii].annotations[jjj]-mu[kkk][jjj]) * (data[iii].annotations[jjj]-mu[kkk][jjj]);
|
||||
}
|
||||
}
|
||||
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
if( sigma[kkk][jjj] < MIN_PROB) { // Very small numbers are a very big problem
|
||||
sigma[kkk][jjj] = MIN_PROB;
|
||||
}
|
||||
}
|
||||
|
||||
if( sumProb > MIN_SUM_PROB ) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
sigma[kkk][jjj] /= sumProb;
|
||||
}
|
||||
}
|
||||
|
||||
pCluster[kkk] = sumProb / numVariants; // BUGBUG: Experiment with this, want to keep many clusters alive
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Evaluate the clusters using titv as an estimate of the true positive rate
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
probTi[kkk] = 0.0;
|
||||
probTv[kkk] = 0.0;
|
||||
}
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
if( data[iii].isTransition ) { // transition
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
probTi[kkk] += pVarInCluster[kkk][iii];
|
||||
}
|
||||
} else { // transversion
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
probTv[kkk] += pVarInCluster[kkk][iii];
|
||||
}
|
||||
}
|
||||
}
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
clusterTruePositiveRate[kkk] = calcTruePositiveRateFromTITV( probTi[kkk] / probTv[kkk] );
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Evaluate each variant using the probability of being in each cluster and that cluster's true positive rate
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
pTrueVariant[ttt][iii] = 0.0;
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
pTrueVariant[ttt][iii] += pVarInCluster[kkk][iii] * clusterTruePositiveRate[kkk];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
maximizeGaussians( data, pVarInCluster );
|
||||
|
||||
System.out.println("Finished iteration " + (ttt+1) );
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Evaluate the clusters using titv as an estimate of the true positive rate
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
evaluateGaussians( data, pVarInCluster ); // One final evaluation because the Gaussians moved in the last maximization step
|
||||
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
probTi[kkk] = 0.0;
|
||||
probTv[kkk] = 0.0;
|
||||
}
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
if( data[iii].isTransition ) { // transition
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
probTi[kkk] += pVarInCluster[kkk][iii];
|
||||
}
|
||||
} else { // transversion
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
probTv[kkk] += pVarInCluster[kkk][iii];
|
||||
}
|
||||
}
|
||||
}
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
clusterTruePositiveRate[kkk] = calcTruePositiveRateFromTITV( probTi[kkk] / probTv[kkk] );
|
||||
}
|
||||
}
|
||||
|
||||
public final double[] applyClusters( final VariantDatum[] data ) {
|
||||
|
||||
final int numVariants = data.length;
|
||||
|
||||
final double[] pTrueVariant = new double[numVariants];
|
||||
final double[][] pVarInCluster = new double[numGaussians][numVariants];
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Expectation Step (calculate the probability that each data point is in each cluster)
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
evaluateGaussians( data, pVarInCluster );
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Evaluate each variant using the probability of being in each cluster and that cluster's true positive rate
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
pTrueVariant[iii] = 0.0;
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
pTrueVariant[iii] += pVarInCluster[kkk][iii] * clusterTruePositiveRate[kkk];
|
||||
}
|
||||
}
|
||||
|
||||
return pTrueVariant;
|
||||
}
|
||||
|
||||
|
||||
private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster ) {
|
||||
|
||||
final int numAnnotations = data[0].annotations.length;
|
||||
|
||||
for( int iii = 0; iii < data.length; iii++ ) {
|
||||
double sumProb = 0.0;
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
double sum = 0.0;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
sum += ( (data[iii].annotations[jjj] - mu[kkk][jjj]) * (data[iii].annotations[jjj] - mu[kkk][jjj]) )
|
||||
/ sigma[kkk][jjj];
|
||||
}
|
||||
pVarInCluster[kkk][iii] = pCluster[kkk] * Math.exp( -0.5 * sum );
|
||||
|
||||
if( pVarInCluster[kkk][iii] < MIN_PROB) { // Very small numbers are a very big problem
|
||||
pVarInCluster[kkk][iii] = MIN_PROB;
|
||||
}
|
||||
|
||||
sumProb += pVarInCluster[kkk][iii];
|
||||
}
|
||||
|
||||
if( sumProb > MIN_SUM_PROB ) { // Very small numbers are a very big problem
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
pVarInCluster[kkk][iii] /= sumProb;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void maximizeGaussians( final VariantDatum[] data, final double[][] pVarInCluster ) {
|
||||
|
||||
final int numVariants = data.length;
|
||||
final int numAnnotations = data[0].annotations.length;
|
||||
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] = 0.0;
|
||||
sigma[kkk][jjj] = 0.0;
|
||||
}
|
||||
}
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
double sumProb = 0.0;
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
final double prob = pVarInCluster[kkk][iii];
|
||||
sumProb += prob;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] += prob * data[iii].annotations[jjj];
|
||||
}
|
||||
}
|
||||
|
||||
if( sumProb > MIN_SUM_PROB ) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] /= sumProb;
|
||||
}
|
||||
}
|
||||
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
final double prob = pVarInCluster[kkk][iii];
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
sigma[kkk][jjj] += prob * (data[iii].annotations[jjj]-mu[kkk][jjj]) * (data[iii].annotations[jjj]-mu[kkk][jjj]);
|
||||
}
|
||||
}
|
||||
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
if( sigma[kkk][jjj] < MIN_PROB) { // Very small numbers are a very big problem
|
||||
sigma[kkk][jjj] = MIN_PROB;
|
||||
}
|
||||
}
|
||||
|
||||
if( sumProb > MIN_SUM_PROB ) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
sigma[kkk][jjj] /= sumProb;
|
||||
}
|
||||
}
|
||||
|
||||
pCluster[kkk] = sumProb / numVariants; // BUGBUG: Experiment with this, want to keep many clusters alive
|
||||
// Perhaps replace the cluster with a new random draw once pCluster gets too small
|
||||
// and break up a large cluster with examples drawn from that cluster
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -37,11 +37,11 @@ public class VariantNearestNeighborsModel extends VariantOptimizationModel {
|
|||
super( _dataManager, _targetTITV );
|
||||
}
|
||||
|
||||
public double[][] run() {
|
||||
public double[] run() {
|
||||
|
||||
final int numVariants = dataManager.numVariants;
|
||||
|
||||
final double[][] pTrueVariant = new double[1][numVariants];
|
||||
final double[] pTrueVariant = new double[numVariants];
|
||||
|
||||
final VariantTree vTree = new VariantTree( 2000 );
|
||||
vTree.createTreeFromData( dataManager.data );
|
||||
|
|
@ -49,7 +49,7 @@ public class VariantNearestNeighborsModel extends VariantOptimizationModel {
|
|||
System.out.println("Finished creating the kd-tree.");
|
||||
|
||||
for(int iii = 0; iii < numVariants; iii++) {
|
||||
pTrueVariant[0][iii] = calcTruePositiveRateFromTITV( vTree.calcNeighborhoodTITV( dataManager.data[iii] ) );
|
||||
pTrueVariant[iii] = calcTruePositiveRateFromTITV( vTree.calcNeighborhoodTITV( dataManager.data[iii] ) );
|
||||
}
|
||||
|
||||
return pTrueVariant;
|
||||
|
|
|
|||
|
|
@ -32,5 +32,5 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
|||
*/
|
||||
|
||||
public interface VariantOptimizationInterface {
|
||||
public double[][] run();
|
||||
public double[] run();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -42,8 +42,9 @@ public abstract class VariantOptimizationModel implements VariantOptimizationInt
|
|||
|
||||
public double calcTruePositiveRateFromTITV( double titv ) {
|
||||
if( titv > targetTITV ) { titv -= 2.0f*(titv-targetTITV); }
|
||||
if( titv < 0.5f ) { titv = 0.5f; }
|
||||
|
||||
return ( (titv - 0.5f) / (targetTITV - 0.5f) );
|
||||
if( titv < 0.5 ) { titv = 0.5; }
|
||||
return ( (titv - 0.5) / (targetTITV - 0.5) );
|
||||
//if( titv < 0.0 ) { titv = 0.0; }
|
||||
//return ( titv / targetTITV );
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -51,11 +51,15 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
|||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName = "target_titv", shortName="titv", doc="The target Ti/Tv ratio to optimize towards. (~~2.2 for whole genome experiments)", required=true)
|
||||
public double TARGET_TITV = 2.2;
|
||||
@Argument(fullName = "filter_output", shortName="filter", doc="If specified the optimizer will not only update the QUAL field of the output VCF file but will also filter the variants", required=false)
|
||||
public boolean FILTER_OUTPUT = false;
|
||||
|
||||
@Argument(fullName = "target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true)
|
||||
private double TARGET_TITV = 2.12;
|
||||
//@Argument(fullName = "filter_output", shortName="filter", doc="If specified the optimizer will not only update the QUAL field of the output VCF file but will also filter the variants", required=false)
|
||||
//private boolean FILTER_OUTPUT = false;
|
||||
@Argument(fullName = "ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||
private boolean IGNORE_INPUT_FILTERS = false;
|
||||
@Argument(fullName = "exclude_annotation", shortName = "exclude", doc = "The names of the annotations which should be excluded from the calculations", required = false)
|
||||
private String[] EXCLUDED_ANNOTATIONS = null;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
|
|
@ -90,46 +94,48 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
|||
double annotationValues[] = new double[numAnnotations];
|
||||
|
||||
for( final VariantContext vc : tracker.getAllVariantContexts(null, context.getLocation(), false, false) ) {
|
||||
if( vc != null && vc.isSNP() ) {
|
||||
|
||||
if( vc != null && vc.isSNP() && (IGNORE_INPUT_FILTERS || !vc.isFiltered()) ) {
|
||||
if( firstVariant ) { // This is the first variant encountered so set up the list of annotations
|
||||
annotationKeys.addAll( vc.getAttributes().keySet() );
|
||||
if( annotationKeys.contains("ID") ) { annotationKeys.remove("ID"); } // ID field is added to the vc's INFO field??
|
||||
if( annotationKeys.contains("ID") ) { annotationKeys.remove("ID"); } // ID field is added to the vc's INFO field?
|
||||
if( annotationKeys.contains("DB") ) { annotationKeys.remove("DB"); }
|
||||
if( annotationKeys.contains("Dels") ) { annotationKeys.remove("Dels"); }
|
||||
if( annotationKeys.contains("AN") ) { annotationKeys.remove("AN"); }
|
||||
if( EXCLUDED_ANNOTATIONS != null ) {
|
||||
for( final String excludedAnnotation : EXCLUDED_ANNOTATIONS ) {
|
||||
if( annotationKeys.contains( excludedAnnotation ) ) { annotationKeys.remove( excludedAnnotation ); }
|
||||
}
|
||||
}
|
||||
numAnnotations = annotationKeys.size() + 1; // +1 for variant quality ("QUAL")
|
||||
annotationValues = new double[numAnnotations];
|
||||
firstVariant = false;
|
||||
}
|
||||
|
||||
//BUGBUG: for now only using the novel SNPs
|
||||
if( vc.getAttribute("ID").equals(".") ) {
|
||||
int iii = 0;
|
||||
for( final String key : annotationKeys ) {
|
||||
int iii = 0;
|
||||
for( final String key : annotationKeys ) {
|
||||
|
||||
double value = 0.0f;
|
||||
try {
|
||||
value = Double.parseDouble( (String)vc.getAttribute( key, "0.0" ) );
|
||||
} catch( NumberFormatException e ) {
|
||||
// do nothing, default value is 0.0f, annotations with zero variance will be ignored later
|
||||
}
|
||||
annotationValues[iii++] = value;
|
||||
double value = 0.0;
|
||||
try {
|
||||
value = Double.parseDouble( (String)vc.getAttribute( key, "0.0" ) );
|
||||
} catch( NumberFormatException e ) {
|
||||
// do nothing, default value is 0.0,
|
||||
// BUGBUG: annotations with zero variance should be ignored
|
||||
}
|
||||
|
||||
// Variant quality ("QUAL") is not in the list of annotations
|
||||
annotationValues[iii] = vc.getPhredScaledQual();
|
||||
|
||||
VariantDatum variantDatum = new VariantDatum();
|
||||
variantDatum.annotations = annotationValues;
|
||||
variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
|
||||
variantDatum.isKnown = !vc.getAttribute("ID").equals(".");
|
||||
variantDatum.isFiltered = vc.isFiltered();
|
||||
mapList.add( variantDatum );
|
||||
annotationValues[iii++] = value;
|
||||
}
|
||||
|
||||
// Variant quality ("QUAL") is not in the list of annotations, but is useful so add it here.
|
||||
annotationValues[iii] = vc.getPhredScaledQual();
|
||||
|
||||
VariantDatum variantDatum = new VariantDatum();
|
||||
variantDatum.annotations = annotationValues;
|
||||
variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
|
||||
variantDatum.isKnown = !vc.getAttribute("ID").equals(".");
|
||||
variantDatum.isFiltered = vc.isFiltered(); // BUGBUG: This field won't be needed in the final version.
|
||||
mapList.add( variantDatum );
|
||||
}
|
||||
}
|
||||
|
||||
return mapList; // This value isn't actually used for anything
|
||||
return mapList;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -149,7 +155,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
|||
|
||||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
|
||||
final VariantDataManager dataManager = new VariantDataManager( reduceSum );
|
||||
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
|
||||
reduceSum.clear(); // Don't need this ever again, clean up some memory
|
||||
|
||||
logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations.");
|
||||
|
|
@ -158,19 +164,17 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
|||
dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]
|
||||
|
||||
final VariantOptimizationModel gmm = new VariantGaussianMixtureModel( dataManager, TARGET_TITV );
|
||||
final double[][] p = gmm.run();
|
||||
final int numIterations = 16;
|
||||
final double[] p = gmm.run();
|
||||
|
||||
// BUGBUG: Change to call a second ROD walker to output the new VCF file with new qual fields and filters
|
||||
// Intermediate cluster ROD can be analyzed to assess clustering performance
|
||||
try {
|
||||
final PrintStream out = new PrintStream("gmm512x16norm.data");
|
||||
final PrintStream out = new PrintStream("gmm128clusterNovel.data"); // Parse in Matlab to create performance plots
|
||||
for(int iii = 0; iii < dataManager.numVariants; iii++) {
|
||||
for( int ttt = 0; ttt < numIterations; ttt++ ) {
|
||||
out.print(p[ttt][iii] + "\t");
|
||||
}
|
||||
out.println((dataManager.data[iii].isTransition ? 1 : 0)
|
||||
out.print(p[iii] + "\t");
|
||||
out.println( (dataManager.data[iii].isTransition ? 1 : 0)
|
||||
+ "\t" + (dataManager.data[iii].isKnown? 1 : 0)
|
||||
+ "\t" + (dataManager.data[iii].isFiltered ? 1 : 0)
|
||||
);
|
||||
+ "\t" + (dataManager.data[iii].isFiltered ? 1 : 0) );
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -178,6 +182,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
|||
e.printStackTrace();
|
||||
System.exit(-1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -92,7 +92,7 @@ public class VariantTree {
|
|||
final double[] distSquared = new double[data.length];
|
||||
int iii = 0;
|
||||
for( final VariantDatum variantDatum : data ) {
|
||||
distSquared[iii] = 0.0f;
|
||||
distSquared[iii] = 0.0;
|
||||
int jjj = 0;
|
||||
for( final double value : variantDatum.annotations) {
|
||||
final double diff = variant[jjj] - value;
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ public class VariantTreeNode {
|
|||
public double cutValue;
|
||||
public VariantDatum[] variants;
|
||||
|
||||
private final int minBinSize = 8000; // BUGBUG: must be bigger than KNN
|
||||
private final int minBinSize = 8000; // BUGBUG: must be larger than number of kNN
|
||||
|
||||
public VariantTreeNode() {
|
||||
left = null;
|
||||
|
|
|
|||
Loading…
Reference in New Issue