Adding the variantrecalibration classes.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3459 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-05-28 18:21:27 +00:00
parent 2014837f8a
commit 522dd7a5b2
12 changed files with 1926 additions and 0 deletions

View File

@ -0,0 +1,237 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.commandline.Argument;
import java.io.IOException;
import java.util.*;
/**
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
*
* @author rpoplin
* @since Feb 11, 2010
*
* @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
*/
public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {

    /////////////////////////////
    // Command Line Arguments
    /////////////////////////////
    @Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
    private boolean IGNORE_ALL_INPUT_FILTERS = false;
    @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
    private String[] IGNORE_INPUT_FILTERS = null;
    @Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
    private String[] USE_ANNOTATIONS = null;
    @Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
    private String CLUSTER_FILENAME = "optimizer.cluster";
    @Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used when clustering", required=false)
    private int NUM_GAUSSIANS = 6;
    @Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed when clustering", required=false)
    private int NUM_ITERATIONS = 10;
    @Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. It can be used to prevent overfitting.", required=false)
    private int MIN_VAR_IN_CLUSTER = 0;
    @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
    private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
    @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
    private String PATH_TO_RESOURCES = "R/";
    @Argument(fullName="weightKnowns", shortName="weightKnowns", doc="The weight for known variants during clustering", required=false)
    private double WEIGHT_KNOWNS = 8.0;
    @Argument(fullName="weightHapMap", shortName="weightHapMap", doc="The weight for known HapMap variants during clustering", required=false)
    private double WEIGHT_HAPMAP = 120.0;
    @Argument(fullName="weight1000Genomes", shortName="weight1000Genomes", doc="The weight for known 1000 Genomes Project variants during clustering", required=false)
    private double WEIGHT_1000GENOMES = 12.0;
    @Argument(fullName="weightMQ1", shortName="weightMQ1", doc="The weight for MQ1 dbSNP variants during clustering", required=false)
    private double WEIGHT_MQ1 = 10.0;
    //@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
    //private int NUM_KNN = 2000;
    //@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
    private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;

    /////////////////////////////
    // Private Member Variables
    /////////////////////////////
    private ExpandingArrayList<String> annotationKeys; // ordered list of the -an annotation names; defines the column order of every VariantDatum.annotations array
    private Set<String> ignoreInputFilterSet = null;   // non-null only when -ignoreFilter values were supplied
    private int maxAC = 0;                             // running maximum allele count seen across all accepted variants

    //---------------------------------------------------------------------------------------------------------------
    //
    // initialize
    //
    //---------------------------------------------------------------------------------------------------------------

    /**
     * Sets up the annotation key list and the ignored-filter set, and verifies that a dbSNP
     * ROD track was bound, since known/novel status drives the per-variant clustering weights.
     *
     * @throws StingException if no dbSNP track was provided on the command line
     */
    public void initialize() {
        annotationKeys = new ExpandingArrayList<String>(Arrays.asList(USE_ANNOTATIONS));
        if( IGNORE_INPUT_FILTERS != null ) {
            ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
        }

        // Scan the bound ROD tracks for one named "dbsnp"
        boolean foundDBSNP = false;
        final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
        for( final ReferenceOrderedDataSource source : dataSources ) {
            final RMDTrack rod = source.getReferenceOrderedData();
            if ( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
                foundDBSNP = true;
            }
        }
        if(!foundDBSNP) {
            throw new StingException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites.");
        }
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // map
    //
    //---------------------------------------------------------------------------------------------------------------

    /**
     * Collects one VariantDatum per unfiltered (or explicitly ignored-filter) SNP call at this
     * locus. Each datum records the decoded annotation values, transition/transversion status,
     * allele count, and a clustering weight boosted for dbSNP / HapMap / 1000 Genomes / MQ1 sites.
     *
     * @param tracker meta-data tracker for the current locus (may be null; see comment below)
     * @param ref     the reference context at this locus
     * @param context the alignment context at this locus
     * @return the (possibly empty) list of data points gathered at this locus
     */
    public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
        final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();

        if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
            return mapList;
        }

        // todo -- do we really need to support multiple tracks -- logic is cleaner without this case -- what's the use case?
        for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
            if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
                if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
                    // BUG FIX: allocate a fresh annotations array for each datum. Previously a
                    // single array declared outside this loop was assigned to every VariantDatum
                    // produced by this map call, so each new variant overwrote the annotation
                    // values of all the datums recorded before it.
                    final double[] annotationValues = new double[annotationKeys.size()];
                    int iii = 0;
                    for( final String key : annotationKeys ) {
                        annotationValues[iii++] = VariantGaussianMixtureModel.decodeAnnotation( key, vc );
                    }

                    final VariantDatum variantDatum = new VariantDatum();
                    variantDatum.annotations = annotationValues;
                    variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
                    variantDatum.alleleCount = vc.getChromosomeCount(vc.getAlternateAllele(0)); // BUGBUG: assumes file has genotypes
                    if( variantDatum.alleleCount > maxAC ) {
                        maxAC = variantDatum.alleleCount;
                    }

                    // Weight known sites more heavily during clustering; HapMap / 1KG / MQ1
                    // membership gets its own user-tunable weight
                    variantDatum.isKnown = false;
                    variantDatum.weight = 1.0;
                    final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
                    if( dbsnp != null ) {
                        variantDatum.isKnown = true;
                        variantDatum.weight = WEIGHT_KNOWNS;
                        if( DbSNPHelper.isHapmap( dbsnp ) ) { variantDatum.weight = WEIGHT_HAPMAP; }
                        else if( DbSNPHelper.is1000genomes( dbsnp ) ) { variantDatum.weight = WEIGHT_1000GENOMES; }
                        else if( DbSNPHelper.isMQ1( dbsnp ) ) { variantDatum.weight = WEIGHT_MQ1; }
                    }

                    mapList.add( variantDatum );
                }
            }
        }

        return mapList;
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // reduce
    //
    //---------------------------------------------------------------------------------------------------------------

    /** @return an empty accumulator list. */
    public ExpandingArrayList<VariantDatum> reduceInit() {
        return new ExpandingArrayList<VariantDatum>();
    }

    /** Appends this locus' data points onto the running accumulator. */
    public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
        reduceSum.addAll( mapValue );
        return reduceSum;
    }

    /**
     * Normalizes the gathered data, fits the Gaussian mixture model, writes the cluster file
     * and per-annotation reports, and shells out to Rscript to plot each report.
     *
     * @param reduceSum every VariantDatum gathered during the traversal
     */
    public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
        final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
        reduceSum.clear(); // Don't need this ever again, clean up some memory

        logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations." );
        logger.info( "The annotations are: " + annotationKeys );

        dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]

        // Create either the Gaussian Mixture Model or the Nearest Neighbors model and run it
        VariantGaussianMixtureModel theModel;
        switch (OPTIMIZATION_MODEL) {
            case GAUSSIAN_MIXTURE_MODEL:
                theModel = new VariantGaussianMixtureModel( dataManager, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER, maxAC );
                break;
            //case K_NEAREST_NEIGHBORS:
            //    theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
            //    break;
            default:
                throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
        }

        theModel.run( CLUSTER_FILENAME );
        theModel.outputClusterReports( CLUSTER_FILENAME );

        for( final String annotation : annotationKeys ) {
            // Execute Rscript command to plot the optimization curve
            // Print out the command line to make it clear to the user what is being executed and how one might modify it
            final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_ClusterReport.R" + " " + CLUSTER_FILENAME + "." + annotation + ".dat " + annotation;
            System.out.println( rScriptCommandLine );

            // Execute the RScript command to plot the table of truth values
            try {
                final Process p = Runtime.getRuntime().exec( rScriptCommandLine );
                p.waitFor();
            } catch (InterruptedException e) {
                // BUG FIX: restore the thread's interrupt status before propagating;
                // swallowing it silently hides the interruption from the caller.
                Thread.currentThread().interrupt();
                throw new StingException(e.getMessage());
            } catch ( IOException e ) {
                throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
            }
        }
    }
}

View File

@ -0,0 +1,37 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Mar 2, 2010
*/
public interface VariantClusteringModel extends VariantOptimizationInterface {
    /**
     * Learn cluster parameters for the Gaussians indexed [startCluster, stopCluster)
     * over the supplied data and write the resulting model to clusterFilename.
     *
     * NOTE(review): the VariantGaussianMixtureModel implementation in this package
     * declares createClusters with an additional trailing boolean (useTITV) parameter,
     * so it does not match this signature — confirm which classes actually implement
     * this interface and whether the signatures should be reconciled.
     */
    public void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFilename );
    //public void applyClusters( final VariantDatum[] data, final String outputPrefix );
}

View File

@ -0,0 +1,140 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.StingException;
import java.io.PrintStream;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Feb 26, 2010
*/
/**
 * Holds the variant data points and the per-annotation normalization statistics
 * (mean and standard deviation) shared between the clustering and apply passes.
 * Statistics are either computed from the data (training) or parsed back out of
 * a previously written cluster file (application).
 */
public class VariantDataManager {

    protected final static Logger logger = Logger.getLogger(VariantDataManager.class);

    public final VariantDatum[] data;       // every data point gathered during the traversal (null when loaded from file)
    public final int numVariants;
    public final int numAnnotations;
    public final double[] meanVector;       // per-annotation mean used for normalization
    public final double[] varianceVector;   // This is really the standard deviation
    public boolean isNormalized;            // true once data has been transformed to (x - mean) / std
    public final ExpandingArrayList<String> annotationKeys;

    /**
     * Training-pass constructor: wraps the gathered data points and allocates (but does
     * not yet fill) the normalization vectors; call normalizeData() to populate them.
     *
     * @param dataList the gathered variant data points; must be non-empty
     * @param keys     the ordered annotation names, or null for an annotation-free manager
     */
    public VariantDataManager( final ExpandingArrayList<VariantDatum> dataList, final ExpandingArrayList<String> keys ) {
        numVariants = dataList.size();
        data = dataList.toArray( new VariantDatum[numVariants] );
        if( numVariants <= 0 ) {
            throw new StingException( "There are zero variants! (or possibly a problem with integer overflow)" );
        }
        annotationKeys = keys;
        if( keys == null ) {
            numAnnotations = 0;
            meanVector = null;
            varianceVector = null;
        } else {
            numAnnotations = keys.size();
            if( numAnnotations <= 0 ) {
                throw new StingException( "There are zero annotations! (or possibly a problem with integer overflow)" );
            }
            meanVector = new double[numAnnotations];
            varianceVector = new double[numAnnotations];
            isNormalized = false;
        }
    }

    /**
     * Application-pass constructor: rebuilds the annotation keys and normalization
     * statistics from the @!ANNOTATION header lines of a cluster file.
     *
     * @param annotationLines lines of the form "@!ANNOTATION,<key>,<mean>,<standard deviation>"
     */
    public VariantDataManager( final ExpandingArrayList<String> annotationLines ) {
        data = null;
        numVariants = 0;
        numAnnotations = annotationLines.size();
        meanVector = new double[numAnnotations];
        varianceVector = new double[numAnnotations];
        isNormalized = true; // statistics from the file are already the normalization constants
        annotationKeys = new ExpandingArrayList<String>();
        int idx = 0;
        for( final String line : annotationLines ) {
            final String[] fields = line.split(",");
            annotationKeys.add(fields[1]);
            meanVector[idx] = Double.parseDouble(fields[2]);
            varianceVector[idx] = Double.parseDouble(fields[3]);
            idx++;
        }
    }

    /**
     * Transforms every annotation value in place to (x - mean) / standard deviation and
     * records the statistics in meanVector / varianceVector. All zero-variance annotations
     * are reported before the method aborts with an exception.
     */
    public void normalizeData() {
        boolean foundZeroVarianceAnnotation = false;
        for( int annIndex = 0; annIndex < numAnnotations; annIndex++ ) {
            final double theMean = mean(data, annIndex);
            final double theSTD = standardDeviation(data, theMean, annIndex);
            logger.info( annotationKeys.get(annIndex) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) );
            if( theSTD < 1E-8 ) {
                // Keep scanning so every offending annotation is reported before throwing below
                foundZeroVarianceAnnotation = true;
                logger.warn("Zero variance is a problem: standard deviation = " + theSTD + " User must -exclude annotations with zero variance. Annotation = " + (annIndex == numAnnotations-1 ? "QUAL" : annotationKeys.get(annIndex)));
            } else if( theSTD < 1E-2 ) {
                logger.warn("Warning! Tiny variance. It is strongly recommended that you -exclude " + annotationKeys.get(annIndex));
            }
            meanVector[annIndex] = theMean;
            varianceVector[annIndex] = theSTD;
            for( final VariantDatum datum : data ) {
                datum.annotations[annIndex] = ( datum.annotations[annIndex] - theMean ) / theSTD;
            }
        }
        isNormalized = true; // Each data point is now [ (x - mean) / standard deviation ]
        if( foundZeroVarianceAnnotation ) {
            throw new StingException("Found annotations with zero variance. They must be excluded before proceeding.");
        }
    }

    /** Arithmetic mean of annotation column {@code index} over all data points. */
    private static double mean( final VariantDatum[] data, final int index ) {
        final double numVars = (double) data.length;
        double sum = 0.0;
        for( final VariantDatum datum : data ) {
            sum += (datum.annotations[index] / numVars);
        }
        return sum;
    }

    /** Population standard deviation of annotation column {@code index} about {@code mean}. */
    private static double standardDeviation( final VariantDatum[] data, final double mean, final int index ) {
        final double numVars = (double) data.length;
        double sum = 0.0;
        for( final VariantDatum datum : data ) {
            final double diff = datum.annotations[index] - mean;
            sum += ((diff * diff) / numVars);
        }
        return Math.sqrt( sum );
    }

    /** Writes one "@!ANNOTATION,<key>,<mean>,<standard deviation>" line per annotation. */
    public void printClusterFileHeader( PrintStream outputFile ) {
        for( int annIndex = 0; annIndex < numAnnotations; annIndex++ ) {
            outputFile.println("@!ANNOTATION," + annotationKeys.get(annIndex) + "," + meanVector[annIndex] + "," + varianceVector[annIndex]);
        }
    }
}

View File

@ -0,0 +1,41 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Feb 24, 2010
*/
/**
 * A single variant data point used for clustering: the decoded annotation values
 * plus the metadata that determines its weight in the Gaussian mixture model.
 */
public class VariantDatum {
    public double[] annotations;  // one value per selected annotation key; normalized in place by VariantDataManager.normalizeData()
    public boolean isTransition;  // true when the SNP substitution type is a transition
    public boolean isKnown;       // true when the site overlaps a real SNP in the dbSNP track
    public double qual;           // variant quality — not populated in the code visible here; confirm before relying on it
    public double weight;         // clustering weight: 1.0 for novels, boosted for dbSNP/HapMap/1000 Genomes/MQ1 sites
    public int alleleCount;       // chromosome count of the first alternate allele (assumes genotypes are present)
}

View File

@ -0,0 +1,730 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.text.XReadLines;
import Jama.*;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.Random;
import java.util.regex.Pattern;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Feb 26, 2010
*/
public final class VariantGaussianMixtureModel extends VariantOptimizationModel {
protected final static Logger logger = Logger.getLogger(VariantGaussianMixtureModel.class);
public final VariantDataManager dataManager;        // the data points and normalization statistics backing this model
private final int numGaussians;                      // number of clusters in the mixture
private final int numIterations;                     // number of EM iterations to run during training
private final static long RANDOM_SEED = 91801305;    // fixed seed so cluster initialization is reproducible
private final static Random rand = new Random( RANDOM_SEED );
private final double MIN_PROB = 1E-7;
private final double MIN_SIGMA = 1E-5;
private final double MIN_DETERMINANT = 1E-5;
private final double[][] mu; // The means for each cluster
private final Matrix[] sigma; // The covariance matrix for each cluster
private final Matrix[] sigmaInverse;                 // precomputed inverses; null during the training pass
private final double[] pCluster;                     // mixture weight of each cluster
private final double[] determinant;                  // determinant of each cluster's covariance matrix
private final double[] alleleCountFactorArray;       // allele-count prior factors, indexed directly by allele count
private final int minVarInCluster;                   // clusters with fewer (weighted) variants than this are not written out
// Line prefixes identifying the three record types in a cluster file
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
private static final Pattern ALLELECOUNT_PATTERN = Pattern.compile("^@!ALLELECOUNT.*");
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
/**
 * Training-pass constructor: allocates the per-cluster model state which will be
 * filled in by run() / createClusters().
 *
 * @param _dataManager     the gathered, normalized variant data
 * @param _numGaussians    number of Gaussians to fit
 * @param _numIterations   number of EM iterations to run
 * @param _minVarInCluster minimum (weighted) variants for a cluster to be written out
 * @param maxAC            largest allele count observed; sizes the prior lookup table
 */
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final int _numGaussians, final int _numIterations, final int _minVarInCluster, final int maxAC ) {
    dataManager = _dataManager;
    numGaussians = _numGaussians;
    numIterations = _numIterations;
    minVarInCluster = _minVarInCluster;

    // Per-cluster state, populated during the EM loop
    mu = new double[numGaussians][];
    sigma = new Matrix[numGaussians];
    pCluster = new double[numGaussians];
    determinant = new double[numGaussians];

    // Indexed directly by allele count, hence the +1
    alleleCountFactorArray = new double[maxAC + 1];

    // Covariance inverses are only needed when evaluating variants against an
    // existing cluster file (the other constructor), not during training
    sigmaInverse = null;
}
/**
 * Application-pass constructor: rebuilds a trained mixture model from a cluster file.
 * The file is split into @!ANNOTATION, @!ALLELECOUNT, and @!CLUSTER records; annotation
 * lines rebuild the normalization statistics, allele-count lines rebuild the prior
 * table, and each cluster line supplies one Gaussian's weight, mean, and covariance.
 *
 * @param _targetTITV           the target transition/transversion ratio passed to the base model
 * @param clusterFileName       the cluster file previously written by the training pass
 * @param backOffGaussianFactor multiplier applied to every covariance entry to widen the Gaussians
 */
public VariantGaussianMixtureModel( final double _targetTITV, final String clusterFileName, final double backOffGaussianFactor ) {
    super( _targetTITV );
    final ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
    final ExpandingArrayList<String> alleleCountLines = new ExpandingArrayList<String>();
    final ExpandingArrayList<String> clusterLines = new ExpandingArrayList<String>();
    // Bucket each file line by its record-type prefix; anything unrecognized is an error
    try {
        for ( final String line : new XReadLines(new File( clusterFileName )) ) {
            if( ANNOTATION_PATTERN.matcher(line).matches() ) {
                annotationLines.add(line);
            } else if( ALLELECOUNT_PATTERN.matcher(line).matches() ) {
                alleleCountLines.add(line);
            } else if( CLUSTER_PATTERN.matcher(line).matches() ) {
                clusterLines.add(line);
            } else {
                throw new StingException("Malformed input file: " + clusterFileName);
            }
        }
    } catch ( FileNotFoundException e ) {
        throw new StingException("Can not find input file: " + clusterFileName);
    }
    dataManager = new VariantDataManager( annotationLines );
    // Several of the clustering parameters aren't used the second time around in ApplyVariantClusters
    numIterations = 0;
    minVarInCluster = 0;
    // BUGBUG: move this parsing out of the constructor
    numGaussians = clusterLines.size();
    mu = new double[numGaussians][dataManager.numAnnotations];
    double sigmaVals[][][] = new double[numGaussians][dataManager.numAnnotations][dataManager.numAnnotations];
    sigma = new Matrix[numGaussians];
    sigmaInverse = new Matrix[numGaussians];
    pCluster = new double[numGaussians];
    determinant = new double[numGaussians];
    // @!ALLELECOUNT,<alleleCount>,<priorFactor> — table is indexed directly by allele count
    alleleCountFactorArray = new double[alleleCountLines.size() + 1];
    for( final String line : alleleCountLines ) {
        final String[] vals = line.split(",");
        alleleCountFactorArray[Integer.parseInt(vals[1])] = Double.parseDouble(vals[2]);
    }
    // @!CLUSTER,<pCluster>,<mu_0..mu_n-1>,<sigma row-major n*n>,-1
    int kkk = 0;
    for( final String line : clusterLines ) {
        final String[] vals = line.split(",");
        pCluster[kkk] = Double.parseDouble(vals[1]); // BUGBUG: #define these magic index numbers, very easy to make a mistake here
        for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
            mu[kkk][jjj] = Double.parseDouble(vals[2+jjj]);
            for( int ppp = 0; ppp < dataManager.numAnnotations; ppp++ ) {
                sigmaVals[kkk][jjj][ppp] = Double.parseDouble(vals[2+dataManager.numAnnotations+(jjj*dataManager.numAnnotations)+ppp]) * backOffGaussianFactor;
            }
        }
        sigma[kkk] = new Matrix(sigmaVals[kkk]);
        sigmaInverse[kkk] = sigma[kkk].inverse(); // Precompute all the inverses and determinants for use later
        determinant[kkk] = sigma[kkk].det();
        kkk++;
    }
    logger.info("Found " + numGaussians + " clusters and using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys);
}
/**
 * Trains the mixture model: builds the allele-count prior, then runs EM clustering
 * over all gathered variants, writing cluster parameters to the given file.
 *
 * @param clusterFileName destination for the learned cluster parameters
 */
public final void run( final String clusterFileName ) {
    // The allele-count prior must exist before clustering begins
    generateAlleleCountPrior();

    // Cluster with every variant; knowns were already weighted above novels in the walker
    logger.info("Clustering with " + dataManager.data.length + " variants.");
    createClusters( dataManager.data, 0, numGaussians, clusterFileName, false );
}
/**
 * Fills alleleCountFactorArray with an (expected / observed) prior factor for each
 * allele-count bin: the expected distribution falls off as 1/AC (normalized over
 * bins 1..max), and the observed distribution is the empirical histogram of the data.
 */
private void generateAlleleCountPrior() {
    final int numBins = alleleCountFactorArray.length;
    final double[] acExpectation = new double[numBins];
    final int[] observedCounts = new int[numBins];

    // Expected allele-count distribution ~ 1/AC, normalized into a probability distribution
    double sumExpectation = 0.0;
    for( int ac = 1; ac < numBins; ac++ ) {
        acExpectation[ac] = 1.0 / ((double) ac);
        sumExpectation += acExpectation[ac];
    }
    for( int ac = 1; ac < numBins; ac++ ) {
        acExpectation[ac] /= sumExpectation;
    }

    // Observed allele-count histogram over the training data
    for( final VariantDatum datum : dataManager.data ) {
        observedCounts[datum.alleleCount]++;
    }

    // Prior factor is (expected / observed) probability mass per bin.
    // NOTE(review): a bin with zero observed variants divides by zero here and
    // yields an infinite factor — confirm the training data always covers every bin.
    final double numData = (double) dataManager.data.length;
    for( int ac = 1; ac < numBins; ac++ ) {
        final double acActual = ((double) observedCounts[ac]) / numData;
        alleleCountFactorArray[ac] = acExpectation[ac] / acActual;
    }
}
/**
 * Looks up the (expected / observed) prior factor for the given allele count,
 * as computed by generateAlleleCountPrior() or parsed from the cluster file.
 */
public final double getAlleleCountPrior( final int alleleCount ) {
    return alleleCountFactorArray[alleleCount];
}
/**
 * Fits the Gaussians indexed [startCluster, stopCluster) to the data with EM.
 * Each cluster starts at a randomly chosen data point with a random symmetric
 * positive-definite covariance; intermediate parameters are written out after
 * every iteration, and the final parameters to clusterFileName itself.
 *
 * Note: initialization draws from the class's fixed-seed Random, so results are
 * reproducible for a given input ordering.
 *
 * @param data            the (normalized) variant data points
 * @param startCluster    first cluster index to (re)initialize and fit
 * @param stopCluster     one past the last cluster index
 * @param clusterFileName base name for the per-iteration and final cluster files
 * @param useTITV         unused in the code visible here — confirm against callers
 */
public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFileName, final boolean useTITV ) {
    final int numVariants = data.length;
    final int numAnnotations = data[0].annotations.length;
    final double[][] pVarInCluster = new double[numGaussians][numVariants]; // Probability that the variant is in that cluster = simply evaluate the multivariate Gaussian
    // loop control variables:
    // iii - loop over data points
    // jjj - loop over annotations (features)
    // ppp - loop over annotations again (full rank covariance matrix)
    // kkk - loop over clusters
    // ttt - loop over EM iterations
    // Set up the initial random Gaussians
    for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
        pCluster[kkk] = 1.0 / ((double) (stopCluster - startCluster));
        mu[kkk] = data[rand.nextInt(numVariants)].annotations;
        final double[][] randSigma = new double[numAnnotations][numAnnotations];
        for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
            for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
                randSigma[ppp][jjj] = 0.5 + 0.5 * rand.nextDouble(); // data has been normalized so sigmas are centered at 1.0
                if(jjj != ppp) { randSigma[jjj][ppp] = 0.0; } // Sigma is a symmetric, positive-definite matrix
            }
        }
        // Multiplying a lower-triangular matrix by its transpose guarantees a symmetric PSD covariance
        Matrix tmp = new Matrix(randSigma);
        tmp = tmp.times(tmp.transpose());
        sigma[kkk] = tmp;
        determinant[kkk] = sigma[kkk].det();
    }
    // The EM loop
    for( int ttt = 0; ttt < numIterations; ttt++ ) {
        /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        // Expectation Step (calculate the probability that each data point is in each cluster)
        /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        evaluateGaussians( data, pVarInCluster, startCluster, stopCluster );
        /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        // Maximization Step (move the clusters to maximize the sum probability of each data point)
        /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        maximizeGaussians( data, pVarInCluster, startCluster, stopCluster );
        /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        // Output cluster parameters at each iteration
        /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        printClusterParameters( clusterFileName + "." + (ttt+1) );
        logger.info("Finished iteration " + (ttt+1) );
    }
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Output the final cluster parameters
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    printClusterParameters( clusterFileName );
}
/**
 * Writes the model to a cluster file: the annotation header, one @!ALLELECOUNT line
 * per allele-count bin, and one @!CLUSTER line per cluster that holds more than
 * minVarInCluster (weighted) variants.
 *
 * @param clusterFileName destination file
 * @throws StingException if the file cannot be created or written
 */
private void printClusterParameters( final String clusterFileName ) {
    PrintStream outputFile = null;
    try {
        outputFile = new PrintStream( clusterFileName );
        dataManager.printClusterFileHeader( outputFile );
        // Allele-count prior table; bin 0 is unused
        for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
            outputFile.print("@!ALLELECOUNT,");
            outputFile.println(iii + "," + alleleCountFactorArray[iii]);
        }
        final int numAnnotations = mu[0].length;
        final int numVariants = dataManager.numVariants;
        for( int kkk = 0; kkk < numGaussians; kkk++ ) {
            // Skip clusters holding too little probability mass (guards against overfitting)
            if( pCluster[kkk] * numVariants > minVarInCluster ) {
                final double sigmaVals[][] = sigma[kkk].getArray();
                outputFile.print("@!CLUSTER,");
                outputFile.print(pCluster[kkk] + ",");
                for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
                    outputFile.print(mu[kkk][jjj] + ",");
                }
                for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
                    for(int ppp = 0; ppp < numAnnotations; ppp++ ) {
                        outputFile.print(sigmaVals[jjj][ppp] + ",");
                    }
                }
                outputFile.println(-1); // sentinel terminating the comma-separated cluster record
            }
        }
    } catch (Exception e) {
        // BUG FIX: include the underlying failure in the message rather than discarding it
        throw new StingException( "Unable to create output file: " + clusterFileName + ", error = " + e.getMessage() );
    } finally {
        // BUG FIX: the stream was previously leaked if an exception occurred mid-write
        if( outputFile != null ) {
            outputFile.close();
        }
    }
}
/**
 * Extracts the numeric value of the named annotation from a VariantContext.
 * "QUAL" is special-cased to the record's phred-scaled quality; every other key
 * is looked up in the context's attributes and parsed as a double.
 *
 * @param annotationKey the annotation name, e.g. "QUAL" or an INFO-field key
 * @param vc            the variant record to read from
 * @return the annotation value as a double
 * @throws StingException if the attribute is missing or not parseable as a double
 */
public static double decodeAnnotation( final String annotationKey, final VariantContext vc ) {
    //if( annotationKey.equals("AB") && !vc.getAttributes().containsKey(annotationKey) ) {
    //    value = (0.5 - 0.005) + (0.01 * rand.nextDouble()); // HomVar calls don't have an allele balance
    //}
    if( annotationKey.equals("QUAL") ) {
        return vc.getPhredScaledQual();
    }
    try {
        // BUG FIX: attributes may be stored as String or as a boxed numeric type depending
        // on the upstream parser. The previous (String) cast threw ClassCastException for
        // numeric objects; stringifying first handles both representations. A missing
        // attribute (null -> "null") still falls through to the same StingException below.
        return Double.parseDouble( String.valueOf( vc.getAttribute( annotationKey ) ) );
    } catch( Exception e ) {
        throw new StingException("No double value detected for annotation = " + annotationKey +
                " in variant at " + vc.getLocation() + ", reported annotation value = " + vc.getAttribute( annotationKey ) );
    }
}
public final double evaluateVariant( final VariantContext vc ) {
final double[] pVarInCluster = new double[numGaussians];
final double[] annotations = new double[dataManager.numAnnotations];
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
final double value = decodeAnnotation( dataManager.annotationKeys.get(jjj), vc );
annotations[jjj] = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj];
}
evaluateGaussiansForSingleVariant( annotations, pVarInCluster );
double sum = 0.0;
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
sum += pVarInCluster[kkk]; // * clusterTruePositiveRate[kkk];
}
return sum;
}
public final void outputClusterReports( final String outputPrefix ) {
final double STD_STEP = 0.2;
final double MAX_STD = 4.0;
final double MIN_STD = -4.0;
final int NUM_BINS = (int)Math.floor((Math.abs(MIN_STD) + Math.abs(MAX_STD)) / STD_STEP);
final int numAnnotations = dataManager.numAnnotations;
int totalCountsKnown = 0;
int totalCountsNovel = 0;
final int counts[][][] = new int[numAnnotations][NUM_BINS][2];
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int iii = 0; iii < NUM_BINS; iii++ ) {
counts[jjj][iii][0] = 0;
counts[jjj][iii][1] = 0;
}
}
for( VariantDatum datum : dataManager.data ) {
final int isKnown = ( datum.isKnown ? 1 : 0 );
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
int histBin = (int)Math.round((datum.annotations[jjj]-MIN_STD) * (1.0 / STD_STEP));
if(histBin < 0) { histBin = 0; }
if(histBin > NUM_BINS-1) { histBin = NUM_BINS-1; }
if(histBin >= 0 && histBin <= NUM_BINS-1) {
counts[jjj][histBin][isKnown]++;
}
}
if( isKnown == 1 ) { totalCountsKnown++; }
else { totalCountsNovel++; }
}
int annIndex = 0;
for( final String annotation : dataManager.annotationKeys ) {
PrintStream outputFile;
try {
outputFile = new PrintStream( outputPrefix + "." + annotation + ".dat" );
} catch (Exception e) {
throw new StingException( "Unable to create output file: " + outputPrefix + ".dat" );
}
outputFile.println("annotationValue,knownDist,novelDist");
for( int iii = 0; iii < NUM_BINS; iii++ ) {
final double annotationValue = (((double)iii * STD_STEP)+MIN_STD) * dataManager.varianceVector[annIndex] + dataManager.meanVector[annIndex];
outputFile.println( annotationValue + "," + ( ((double)counts[annIndex][iii][1])/((double)totalCountsKnown) ) +
"," + ( ((double)counts[annIndex][iii][0])/((double)totalCountsNovel) ));
}
annIndex++;
}
// BUGBUG: next output the actual cluster on top by integrating out every other annotation
}
    /**
     * Sweep a quality cutoff from MAX_QUAL down to zero and, at each cutoff, record how
     * many known/novel variants pass and their Ti/Tv ratios, writing one CSV row per
     * cutoff to {@code <outputPrefix>.dat}. Afterwards, scan the recorded curve for the
     * loosest cutoffs whose novel Ti/Tv reaches 90%, 95%, 98% and 100% of the target
     * Ti/Tv, logging a summary for each.
     *
     * @param data               the scored variants (reads .qual, .isKnown, .isTransition)
     * @param outputPrefix       prefix for the output .dat file name
     * @param desiredNumVariants if nonzero, also log the cutoff at which at least this
     *                           many variants pass
     */
    public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, final int desiredNumVariants ) {
        final int numVariants = data.length;
        final boolean[] markedVariant = new boolean[numVariants];
        final double MAX_QUAL = 100.0;
        final double QUAL_STEP = 0.1;
        // NUM_BINS must be >= the number of iterations of the qCut loop below;
        // the loop's -0.001 tolerance absorbs floating-point drift in qCut.
        final int NUM_BINS = (int) ((MAX_QUAL / QUAL_STEP) + 1);
        // per-cutoff records, indexed by jjj in loop order (highest cutoff first)
        final int numKnownAtCut[] = new int[NUM_BINS];
        final int numNovelAtCut[] = new int[NUM_BINS];
        final double knownTiTvAtCut[] = new double[NUM_BINS];
        final double novelTiTvAtCut[] = new double[NUM_BINS];
        final double theCut[] = new double[NUM_BINS];
        for( int iii = 0; iii < numVariants; iii++ ) {
            markedVariant[iii] = false;
        }
        PrintStream outputFile;
        try {
            outputFile = new PrintStream( outputPrefix + ".dat" );
        } catch (Exception e) {
            throw new StingException( "Unable to create output file: " + outputPrefix + ".dat" );
        }
        // running totals are cumulative across cutoffs: each variant is counted once,
        // at the first (highest) cutoff it passes, and markedVariant prevents recounting
        int numKnown = 0;
        int numNovel = 0;
        int numKnownTi = 0;
        int numKnownTv = 0;
        int numNovelTi = 0;
        int numNovelTv = 0;
        boolean foundDesiredNumVariants = false;
        int jjj = 0;
        outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV");
        for( double qCut = MAX_QUAL; qCut >= -0.001; qCut -= QUAL_STEP ) {
            for( int iii = 0; iii < numVariants; iii++ ) {
                if( !markedVariant[iii] ) {
                    if( data[iii].qual >= qCut ) {
                        markedVariant[iii] = true;
                        if( data[iii].isKnown ) { // known
                            numKnown++;
                            if( data[iii].isTransition ) { // transition
                                numKnownTi++;
                            } else { // transversion
                                numKnownTv++;
                            }
                        } else { // novel
                            numNovel++;
                            if( data[iii].isTransition ) { // transition
                                numNovelTi++;
                            } else { // transversion
                                numNovelTv++;
                            }
                        }
                    }
                }
            }
            if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) {
                logger.info( "Keeping variants with QUAL >= " + String.format("%.1f",qCut) + " results in a filtered set with: " );
                logger.info("\t" + numKnown + " known variants");
                logger.info("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)");
                logger.info("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv)));
                logger.info("\t" + String.format("%.4f novel Ti/Tv ratio", ((double)numNovelTi) / ((double)numNovelTv)));
                foundDesiredNumVariants = true;
            }
            // the CSV uses "NaN" when a ratio is undefined, while the in-memory record
            // below uses 0.0 so the quantile scan never matches an undefined ratio
            outputFile.println( qCut + "," + numKnown + "," + numNovel + "," +
                    ( numKnownTi == 0 || numKnownTv == 0 ? "NaN" : ( ((double)numKnownTi) / ((double)numKnownTv) ) ) + "," +
                    ( numNovelTi == 0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) ) ));
            numKnownAtCut[jjj] = numKnown;
            numNovelAtCut[jjj] = numNovel;
            knownTiTvAtCut[jjj] = ( numKnownTi == 0 || numKnownTv == 0 ? 0.0 : ( ((double)numKnownTi) / ((double)numKnownTv) ) );
            novelTiTvAtCut[jjj] = ( numNovelTi == 0 || numNovelTv == 0 ? 0.0 : ( ((double)numNovelTi) / ((double)numNovelTv) ) );
            theCut[jjj] = qCut;
            jjj++;
        }
        // loop back through the data points looking for appropriate places to cut the data to get the target novel titv ratio
        // (walks from the loosest cutoff upward, reporting the first cutoff reaching each quantile of the target)
        int checkQuantile = 0;
        for( jjj = NUM_BINS-1; jjj >= 0; jjj-- ) {
            boolean foundCut = false;
            if( checkQuantile == 0 ) {
                if( novelTiTvAtCut[jjj] >= 0.9 * targetTITV ) {
                    foundCut = true;
                    checkQuantile++;
                }
            } else if( checkQuantile == 1 ) {
                if( novelTiTvAtCut[jjj] >= 0.95 * targetTITV ) {
                    foundCut = true;
                    checkQuantile++;
                }
            } else if( checkQuantile == 2 ) {
                if( novelTiTvAtCut[jjj] >= 0.98 * targetTITV ) {
                    foundCut = true;
                    checkQuantile++;
                }
            } else if( checkQuantile == 3 ) {
                if( novelTiTvAtCut[jjj] >= targetTITV ) {
                    foundCut = true;
                    checkQuantile++;
                }
            } else if( checkQuantile == 4 ) {
                break; // break out
            }
            if( foundCut ) {
                logger.info( "Keeping variants with QUAL >= " + String.format("%.1f",theCut[jjj]) + " results in a filtered set with: " );
                logger.info("\t" + numKnownAtCut[jjj] + " known variants");
                logger.info("\t" + numNovelAtCut[jjj] + " novel variants, (dbSNP rate = " +
                        String.format("%.2f",((double) numKnownAtCut[jjj] * 100.0) / ((double) numKnownAtCut[jjj] + numNovelAtCut[jjj]) ) + "%)");
                logger.info("\t" + String.format("%.4f known Ti/Tv ratio", knownTiTvAtCut[jjj]));
                logger.info("\t" + String.format("%.4f novel Ti/Tv ratio", novelTiTvAtCut[jjj]));
            }
        }
        outputFile.close();
    }
    /**
     * E-step of the EM loop: for every variant, evaluate each Gaussian's weighted
     * density at the variant's annotation vector, then normalize across clusters and
     * scale by the variant's weight, writing into pVarInCluster[cluster][variant].
     * Throws on numerical instability (non-positive-definite covariance, density > 1).
     *
     * @param data          the training variants (normalized annotations)
     * @param pVarInCluster output matrix [numGaussians][numVariants], filled in place
     * @param startCluster  first cluster index to evaluate (inclusive)
     * @param stopCluster   last cluster index to evaluate (exclusive)
     */
    private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {
        final int numAnnotations = data[0].annotations.length;
        double likelihood = 0.0;
        // hoist per-cluster inverse covariance and normalization constant out of the variant loop
        final double sigmaVals[][][] = new double[numGaussians][][];
        final double denom[] = new double[numGaussians];
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            sigmaVals[kkk] = sigma[kkk].inverse().getArray();
            // multivariate normal constant: (2*pi)^(d/2) * sqrt(|Sigma|); 3.14159 approximates pi
            denom[kkk] = Math.pow(2.0 * 3.14159, ((double)numAnnotations) / 2.0) * Math.pow(Math.abs(determinant[kkk]), 0.5);
        }
        final double mult[] = new double[numAnnotations];
        for( int iii = 0; iii < data.length; iii++ ) {
            double sumProb = 0.0;
            for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
                double sum = 0.0;
                // mult = (x - mu)' * Sigma^-1
                for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                    mult[jjj] = 0.0;
                    for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                        mult[jjj] += (data[iii].annotations[ppp] - mu[kkk][ppp]) * sigmaVals[kkk][ppp][jjj];
                    }
                }
                // sum = Mahalanobis distance (x - mu)' * Sigma^-1 * (x - mu)
                for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                    sum += mult[jjj] * (data[iii].annotations[jjj] - mu[kkk][jjj]);
                }
                pVarInCluster[kkk][iii] = pCluster[kkk] * (Math.exp( -0.5 * sum ) / denom[kkk]);
                // likelihood accumulates the raw (pre-normalization, pre-weight) densities
                likelihood += pVarInCluster[kkk][iii];
                if(Double.isNaN(denom[kkk]) || determinant[kkk] < 0.5 * MIN_DETERMINANT) {
                    System.out.println("det = " + sigma[kkk].det());
                    System.out.println("denom = " + denom[kkk]);
                    System.out.println("sumExp = " + sum);
                    System.out.println("pVar = " + pVarInCluster[kkk][iii]);
                    System.out.println("=-------=");
                    throw new StingException("Numerical Instability! determinant of covariance matrix <= 0. Try running with fewer clusters and then with better behaved annotation values.");
                }
                if(sum < 0.0) {
                    // a negative quadratic form means Sigma^-1 is not positive definite
                    System.out.println("det = " + sigma[kkk].det());
                    System.out.println("denom = " + denom[kkk]);
                    System.out.println("sumExp = " + sum);
                    System.out.println("pVar = " + pVarInCluster[kkk][iii]);
                    System.out.println("=-------=");
                    throw new StingException("Numerical Instability! covariance matrix no longer positive definite. Try running with fewer clusters and then with better behaved annotation values.");
                }
                if(pVarInCluster[kkk][iii] > 1.0) {
                    System.out.println("det = " + sigma[kkk].det());
                    System.out.println("denom = " + denom[kkk]);
                    System.out.println("sumExp = " + sum);
                    System.out.println("pVar = " + pVarInCluster[kkk][iii]);
                    System.out.println("=-------=");
                    throw new StingException("Numerical Instability! probability distribution returns > 1.0. Try running with fewer clusters and then with better behaved annotation values.");
                }
                if( pVarInCluster[kkk][iii] < MIN_PROB ) { // Very small numbers are a very big problem
                    pVarInCluster[kkk][iii] = MIN_PROB; // + MIN_PROB * rand.nextDouble();
                }
                sumProb += pVarInCluster[kkk][iii];
            }
            // normalize responsibilities across clusters, then apply the variant's weight
            for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
                pVarInCluster[kkk][iii] /= sumProb;
                pVarInCluster[kkk][iii] *= data[iii].weight;
            }
        }
        logger.info("Explained likelihood = " + String.format("%.5f",likelihood / data.length));
    }
private void evaluateGaussiansForSingleVariant( final double[] annotations, final double[] pVarInCluster ) {
final int numAnnotations = annotations.length;
final double mult[] = new double[numAnnotations];
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
final double sigmaVals[][] = sigmaInverse[kkk].getArray();
double sum = 0.0;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
mult[jjj] = 0.0;
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
mult[jjj] += (annotations[ppp] - mu[kkk][ppp]) * sigmaVals[ppp][jjj];
}
}
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
sum += mult[jjj] * (annotations[jjj] - mu[kkk][jjj]);
}
final double denom = Math.pow(2.0 * 3.14159, ((double)numAnnotations) / 2.0) * Math.pow(determinant[kkk], 0.5);
pVarInCluster[kkk] = pCluster[kkk] * (Math.exp( -0.5 * sum )) / denom;
}
}
    /**
     * M-step of the EM loop: re-estimate each cluster's mean, covariance and mixing
     * weight from the responsibilities computed by the E-step, then renormalize the
     * mixing weights so they sum to one.
     *
     * @param data          the training variants (normalized annotations)
     * @param pVarInCluster responsibility matrix [numGaussians][numVariants]
     * @param startCluster  first cluster index to update (inclusive)
     * @param stopCluster   last cluster index to update (exclusive)
     */
    private void maximizeGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {
        final int numVariants = data.length;
        final int numAnnotations = data[0].annotations.length;
        final double sigmaVals[][][] = new double[numGaussians][numAnnotations][numAnnotations];
        // reset the accumulators for the clusters being updated
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                mu[kkk][jjj] = 0.0;
                for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                    sigmaVals[kkk][jjj][ppp] = 0.0;
                }
            }
        }
        double sumPK = 0.0;
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            // mean: responsibility-weighted average of the annotation vectors
            double sumProb = 0.0;
            for( int iii = 0; iii < numVariants; iii++ ) {
                final double prob = pVarInCluster[kkk][iii];
                sumProb += prob;
                for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                    mu[kkk][jjj] += prob * data[iii].annotations[jjj];
                }
            }
            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                mu[kkk][jjj] /= sumProb;
            }
            // covariance: accumulate only the upper triangle, mirrored below
            for( int iii = 0; iii < numVariants; iii++ ) {
                final double prob = pVarInCluster[kkk][iii];
                for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                    for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
                        sigmaVals[kkk][jjj][ppp] += prob * (data[iii].annotations[jjj]-mu[kkk][jjj]) * (data[iii].annotations[ppp]-mu[kkk][ppp]);
                    }
                }
            }
            // NOTE(review): the MIN_SIGMA floor is applied to the UNnormalized sums,
            // i.e. before the division by sumProb below — confirm this order is intended
            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
                    if( sigmaVals[kkk][jjj][ppp] < MIN_SIGMA ) { // Very small numbers are a very big problem
                        sigmaVals[kkk][jjj][ppp] = MIN_SIGMA;// + MIN_SIGMA * rand.nextDouble();
                    }
                    sigmaVals[kkk][ppp][jjj] = sigmaVals[kkk][jjj][ppp]; // sigma must be a symmetric matrix
                }
            }
            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                    sigmaVals[kkk][jjj][ppp] /= sumProb;
                }
            }
            sigma[kkk] = new Matrix(sigmaVals[kkk]);
            determinant[kkk] = sigma[kkk].det();
            // mixing weight: fraction of total responsibility mass captured by this cluster
            pCluster[kkk] = sumProb / numVariants;
            sumPK += pCluster[kkk];
        }
        // ensure pCluster sums to one, it doesn't automatically due to very small numbers getting capped
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            pCluster[kkk] /= sumPK;
        }
        // The two disabled blocks below are earlier experiments: the first broke up
        // oversized clusters, the second re-seeded singular/vanishing clusters.
        /*
        // Clean up extra big or extra small clusters
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            if( pCluster[kkk] > 0.45 ) { // This is a very large cluster compared to all the others
                System.out.println("!! Found very large cluster! Busting it up into smaller clusters.");
                final int numToReplace = 3;
                final Matrix savedSigma = sigma[kkk];
                for( int rrr = 0; rrr < numToReplace; rrr++ ) {
                    // Find an example variant in the large cluster, drawn randomly
                    int randVarIndex = -1;
                    boolean foundVar = false;
                    while( !foundVar ) {
                        randVarIndex = rand.nextInt( numVariants );
                        final double probK = pVarInCluster[kkk][randVarIndex];
                        boolean inClusterK = true;
                        for( int ccc = startCluster; ccc < stopCluster; ccc++ ) {
                            if( pVarInCluster[ccc][randVarIndex] > probK ) {
                                inClusterK = false;
                                break;
                            }
                        }
                        if( inClusterK ) { foundVar = true; }
                    }
                    // Find a place to put the example variant
                    if( rrr == 0 ) { // Replace the big cluster that kicked this process off
                        mu[kkk] = data[randVarIndex].annotations;
                        pCluster[kkk] = 1.0 / ((double) (stopCluster-startCluster));
                    } else { // Replace the cluster with the minimum prob
                        double minProb = pCluster[startCluster];
                        int minClusterIndex = startCluster;
                        for( int ccc = startCluster; ccc < stopCluster; ccc++ ) {
                            if( pCluster[ccc] < minProb ) {
                                minProb = pCluster[ccc];
                                minClusterIndex = ccc;
                            }
                        }
                        mu[minClusterIndex] = data[randVarIndex].annotations;
                        sigma[minClusterIndex] = savedSigma;
                        //for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                        //    for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                        //        sigma[minClusterIndex].set(jjj, ppp, sigma[minClusterIndex].get(jjj, ppp) - 0.06 + 0.12 * rand.nextDouble());
                        //    }
                        //}
                        pCluster[minClusterIndex] = 0.5 / ((double) (stopCluster-startCluster));
                    }
                }
            }
        }
        */
        /*
        // Replace extremely small clusters with another random draw from the dataset
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            if( pCluster[kkk] < 0.0005 * (1.0 / ((double) (stopCluster-startCluster))) ||
                    determinant[kkk] < MIN_DETERMINANT ) { // This is a very small cluster compared to all the others
                logger.info("!! Found singular cluster! Initializing a new random cluster.");
                pCluster[kkk] = 0.1 / ((double) (stopCluster-startCluster));
                mu[kkk] = data[rand.nextInt(numVariants)].annotations;
                final double[][] randSigma = new double[numAnnotations][numAnnotations];
                for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                    for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
                        randSigma[ppp][jjj] = 0.50 + 0.5 * rand.nextDouble(); // data is normalized so this is centered at 1.0
                        if(jjj != ppp) { randSigma[jjj][ppp] = 0.0; } // Sigma is a symmetric, positive-definite matrix
                    }
                }
                Matrix tmp = new Matrix(randSigma);
                tmp = tmp.times(tmp.transpose());
                sigma[kkk] = tmp;
                determinant[kkk] = sigma[kkk].det();
            }
        }
        // renormalize pCluster since things might have changed due to the previous step
        sumPK = 0.0;
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            sumPK += pCluster[kkk];
        }
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            pCluster[kkk] /= sumPK;
        }
        */
    }
}

View File

@ -0,0 +1,78 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broadinstitute.sting.utils.StingException;
import java.io.PrintStream;
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Mar 1, 2010
*/
/**
 * Placeholder k-nearest-neighbors implementation of the variant optimization
 * model. The kd-tree based scoring (VariantTree.calcNeighborhoodTITV over each
 * variant's numKNN nearest neighbors) is currently disabled; run() always throws
 * until that implementation is brought up to date.
 */
public final class VariantNearestNeighborsModel extends VariantOptimizationModel {

    // number of nearest neighbors to consult per variant once the model is re-enabled
    private final int numKNN;

    public VariantNearestNeighborsModel( VariantDataManager _dataManager, final double _targetTITV, final int _numKNN ) {
        super( _targetTITV );
        // the data manager is accepted for interface parity but unused while the model is disabled
        numKNN = _numKNN;
    }

    public void run( final String outputPrefix ) {
        throw new StingException( "Nearest Neighbors model hasn't been updated yet." );
        // The previous implementation built a VariantTree from dataManager.data,
        // scored each variant via calcTruePositiveRateFromTITV over its neighborhood
        // Ti/Tv, and wrote the results to <outputPrefix>.knn.optimize.
    }
}

View File

@ -0,0 +1,36 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Feb 26, 2010
*/
/**
 * Common entry point for variant optimization models (e.g. the Gaussian mixture
 * model and the k-nearest-neighbors model): run the model end to end, writing
 * results to files named with the given prefix.
 */
public interface VariantOptimizationInterface {
    public void run( String outputPrefix );
}

View File

@ -0,0 +1,70 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Feb 26, 2010
*/
/**
 * Base class for variant optimization models. Holds the expected ("target")
 * novel-variant Ti/Tv ratio and provides helpers that convert an observed Ti/Tv
 * ratio into an approximate true-positive rate, under the assumption that a set
 * of purely false-positive calls has a Ti/Tv ratio of 0.5.
 */
public abstract class VariantOptimizationModel implements VariantOptimizationInterface {

    /** The available model implementations selectable on the command line. */
    public enum Model {
        GAUSSIAN_MIXTURE_MODEL,
        K_NEAREST_NEIGHBORS
    }

    protected final double targetTITV;

    public VariantOptimizationModel() {
        // NOTE(review): a zero target makes the TPR helpers divide by -0.5 and
        // return negative rates — confirm this no-arg constructor is never used
        targetTITV = 0.0;
    }

    public VariantOptimizationModel( final double _targetTITV ) {
        targetTITV = _targetTITV;
    }

    /**
     * Map an observed Ti/Tv ratio onto an approximate true-positive rate in [0, 1]:
     * 0.5 (random calls) maps to 0 and targetTITV maps to 1. Ratios above the target
     * are reflected back below it, treating overshoot as equally unlikely.
     * Undefined when targetTITV == 0.5 (division by zero).
     *
     * @param _titv the observed Ti/Tv ratio
     * @return the estimated fraction of true positives
     */
    public final double calcTruePositiveRateFromTITV( final double _titv ) {
        double titv = _titv;
        // 2.0 (not the former float literal 2.0f) — exact widening, identical result
        if( titv > targetTITV ) { titv -= 2.0 * (titv - targetTITV); }
        if( titv < 0.5 ) { titv = 0.5; }
        return ( (titv - 0.5) / (targetTITV - 0.5) );
    }

    /**
     * Blend a true-positive rate computed against the known variants' Ti/Tv with one
     * computed against the overall target, weighted by knownAlphaFactor.
     *
     * @param knownTITV        Ti/Tv ratio of the known (dbSNP) variants
     * @param _novelTITV       Ti/Tv ratio of the novel variants
     * @param overallTITV      overall observed Ti/Tv ratio
     * @param knownAlphaFactor weight in [0, 1] given to the known-based estimate
     * @return the blended true-positive rate estimate
     */
    public final double calcTruePositiveRateFromKnownTITV( final double knownTITV, final double _novelTITV, final double overallTITV, final double knownAlphaFactor ) {
        final double tprTarget = calcTruePositiveRateFromTITV( overallTITV );
        double novelTITV = _novelTITV;
        if( novelTITV > knownTITV ) { novelTITV -= 2.0 * (novelTITV - knownTITV); }
        if( novelTITV < 0.5 ) { novelTITV = 0.5; }
        final double tprKnown = ( (novelTITV - 0.5) / (knownTITV - 0.5) );
        return ( knownAlphaFactor * tprKnown ) + ( (1.0 - knownAlphaFactor) * tprTarget );
    }
}

View File

@ -0,0 +1,244 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.VCFReader;
import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import java.io.File;
import java.io.IOException;
import java.util.*;
/**
* Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
*
* @author rpoplin
* @since Mar 17, 2010
*
* @help.summary Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
*/
public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
    /////////////////////////////
    // Command Line Arguments
    /////////////////////////////
    @Argument(fullName="target_titv", shortName="titv", doc="The expected Ti/Tv ratio to display on optimization curve output figures. (~~2.1 for whole genome experiments)", required=false)
    private double TARGET_TITV = 2.1;
    @Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by spreading out the Gaussians.", required=false)
    private double BACKOFF_FACTOR = 1.0;
    @Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
    private int DESIRED_NUM_VARIANTS = 0;
    @Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
    private boolean IGNORE_ALL_INPUT_FILTERS = false;
    @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
    private String[] IGNORE_INPUT_FILTERS = null;
    @Argument(fullName="known_prior", shortName="knownPrior", doc="A prior on the quality of known variants, a phred scaled probability of being true.", required=false)
    private int KNOWN_QUAL_PRIOR = 9;
    @Argument(fullName="novel_prior", shortName="novelPrior", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
    private int NOVEL_QUAL_PRIOR = 2;
    @Argument(fullName="quality_scale_factor", shortName="qScale", doc="Multiply all final quality scores by this value. Needed to normalize the quality scores.", required=false)
    private double QUALITY_SCALE_FACTOR = 50.0;
    @Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
    private String OUTPUT_PREFIX = "optimizer";
    @Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
    private String CLUSTER_FILENAME = "optimizer.cluster";
    // disabled as a command-line option: only the Gaussian mixture model path is usable here
    //@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
    private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
    @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
    private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
    @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
    private String PATH_TO_RESOURCES = "R/";
    /////////////////////////////
    // Private Member Variables
    /////////////////////////////
    private VariantGaussianMixtureModel theModel = null;     // the trained model applied in map()
    private VCFWriter vcfWriter;                             // recalibrated records go to OUTPUT_PREFIX + ".vcf"
    private Set<String> ignoreInputFilterSet = null;         // filter names to ignore, from IGNORE_INPUT_FILTERS
    private final ArrayList<String> ALLOWED_FORMAT_FIELDS = new ArrayList<String>();  // genotype FORMAT keys passed through to output
//---------------------------------------------------------------------------------------------------------------
//
// initialize
//
//---------------------------------------------------------------------------------------------------------------
    /**
     * Walker setup: builds the optimization model from the cluster file, collects the
     * sample names from all bound VCF tracks, opens the output VCF writer and writes
     * its header, and verifies that a dbSNP track is bound (required to distinguish
     * known from novel sites).
     */
    public void initialize() {
        // normalize the resources path so later concatenations can assume a trailing slash
        if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
        if( IGNORE_INPUT_FILTERS != null ) {
            ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
        }
        switch (OPTIMIZATION_MODEL) {
            case GAUSSIAN_MIXTURE_MODEL:
                theModel = new VariantGaussianMixtureModel( TARGET_TITV, CLUSTER_FILENAME, BACKOFF_FACTOR );
                break;
            //case K_NEAREST_NEIGHBORS:
            //    theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
            //    break;
            default:
                throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
        }
        ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_KEY); // copied from VariantsToVCF
        ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
        ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.DEPTH_KEY);
        ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_POSTERIORS_TRIPLET_KEY);
        // setup the header fields: carry over input headers, add the OQ (original quality) INFO field
        final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
        hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
        hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "The original variant quality score"));
        hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
        vcfWriter = new VCFWriter( new File(OUTPUT_PREFIX + ".vcf") );
        // union of sample names across every bound VCF track
        final TreeSet<String> samples = new TreeSet<String>();
        final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
        for( final ReferenceOrderedDataSource source : dataSources ) {
            final RMDTrack rod = source.getReferenceOrderedData();
            if( rod.getRecordType().equals(VCFRecord.class) ) {
                final VCFReader reader = new VCFReader(rod.getFile());
                final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
                samples.addAll(vcfSamples);
                reader.close();
            }
        }
        final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
        vcfWriter.writeHeader(vcfHeader);
        // a dbSNP track is mandatory: the known/novel split drives the priors in map()
        boolean foundDBSNP = false;
        for( final ReferenceOrderedDataSource source : dataSources ) {
            final RMDTrack rod = source.getReferenceOrderedData();
            if( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
                foundDBSNP = true;
            }
        }
        if(!foundDBSNP) {
            throw new StingException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites.");
        }
    }
//---------------------------------------------------------------------------------------------------------------
//
// map
//
//---------------------------------------------------------------------------------------------------------------
/**
 * For each input variant at this locus: recompute its quality score under the
 * trained model, write it to the output VCF, and collect a VariantDatum for
 * the optimization-curve calculation performed in onTraversalDone().
 *
 * Only SNPs that are unfiltered (or whose filters the user asked to ignore)
 * are recalibrated; other SNP records are passed through to the VCF unchanged.
 */
public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
    final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
    if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
        return mapList;
    }
    for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
        // Skip the dbSNP track itself; it is only consulted below for known/novel status
        if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
            final VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), ALLOWED_FORMAT_FIELDS, false, false);
            if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
                final VariantDatum variantDatum = new VariantDatum();
                variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
                // A site counts as "known" when dbSNP has a real SNP at this position
                final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
                variantDatum.isKnown = dbsnp != null;
                variantDatum.alleleCount = vc.getChromosomeCount(vc.getAlternateAllele(0)); // BUGBUG: assumes file has genotypes
                // Combine the model's per-variant likelihood with allele-count and known/novel priors
                final double acPrior = theModel.getAlleleCountPrior( variantDatum.alleleCount );
                final double knownPrior = ( variantDatum.isKnown ? QualityUtils.qualToProb(KNOWN_QUAL_PRIOR) : QualityUtils.qualToProb(NOVEL_QUAL_PRIOR) );
                final double pTrue = theModel.evaluateVariant( vc ) * acPrior * knownPrior;
                // Phred-scale the error probability, clamped away from zero so the qual stays finite
                variantDatum.qual = QUALITY_SCALE_FACTOR * QualityUtils.phredScaleErrorRate( Math.max(1.0 - pTrue, 0.000000001) ); // BUGBUG: don't have a normalizing constant, so need to scale up qual scores arbitrarily
                mapList.add( variantDatum );
                // Preserve the original quality in the OQ info field, then overwrite qual and filter
                vcf.addInfoField("OQ", ((Double)vc.getPhredScaledQual()).toString() );
                vcf.setQual( variantDatum.qual );
                vcf.setFilterString(VCFRecord.PASSES_FILTERS);
                vcfWriter.addRecord( vcf );
            } else { // not a SNP or is filtered so just dump it out to the VCF file
                vcfWriter.addRecord( vcf );
            }
        }
    }
    return mapList;
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce
//
//---------------------------------------------------------------------------------------------------------------
/**
 * Provides the initial (empty) accumulator for the reduce phase of the traversal.
 */
public ExpandingArrayList<VariantDatum> reduceInit() {
    final ExpandingArrayList<VariantDatum> accumulator = new ExpandingArrayList<VariantDatum>();
    return accumulator;
}
/**
 * Folds the variants gathered at one locus into the running traversal-wide list
 * and hands the (same) accumulator back to the engine.
 */
public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
    for( final VariantDatum datum : mapValue ) {
        reduceSum.add( datum );
    }
    return reduceSum;
}
/**
 * Finishes the traversal: closes the output VCF, writes the optimization-curve
 * table for the accumulated variants, and launches Rscript to plot it.
 */
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
    vcfWriter.close();
    final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
    reduceSum.clear(); // Don't need this ever again, clean up some memory
    theModel.outputOptimizationCurve( dataManager.data, OUTPUT_PREFIX, DESIRED_NUM_VARIANTS );
    // Execute Rscript command to plot the optimization curve
    // Print out the command line to make it clear to the user what is being executed and how one might modify it
    final String rScriptFile = PATH_TO_RESOURCES + "plot_OptimizationCurve.R";
    final String datFile = OUTPUT_PREFIX + ".dat";
    final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + rScriptFile + " " + datFile + " " + TARGET_TITV;
    System.out.println( rScriptCommandLine );
    try {
        // Pass the arguments explicitly rather than via Runtime.exec(String), which
        // re-tokenizes on whitespace and would break paths containing spaces.
        new ProcessBuilder( PATH_TO_RSCRIPT, rScriptFile, datFile, String.valueOf(TARGET_TITV) ).start();
    } catch ( IOException e ) {
        // Chain the cause so the user can see why Rscript failed to launch.
        throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine, e );
    }
}
}

View File

@ -0,0 +1,144 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broadinstitute.sting.utils.StingException;
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Feb 24, 2010
*/
/**
 * A kd-tree-like spatial index over variant annotation vectors, used to compute
 * the transition/transversion ratio of the approximate k-nearest neighborhood
 * of a query variant.
 *
 * Created by IntelliJ IDEA.
 * User: rpoplin
 * Date: Feb 24, 2010
 */
public class VariantTree {

    private final VariantTreeNode root; // root of the space partition built by createTreeFromData()
    private final int numKNN;           // k: number of nearest neighbors defining a neighborhood

    public VariantTree( final int _numKNN ) {
        root = new VariantTreeNode();
        numKNN = _numKNN;
    }

    /**
     * Builds the tree by recursively cutting the data along annotation dimensions.
     * Assumes data is non-empty and every datum has the same annotation count.
     */
    public void createTreeFromData( VariantDatum[] data ) {
        root.cutData( data, 0, 0, data[0].annotations.length );
    }

    /**
     * Computes the transition/transversion ratio over (at least) the numKNN points
     * nearest to the given variant in annotation space.
     *
     * Distances are squared Euclidean distances; skipping the sqrt does not change
     * the ranking. Every point tied with the k-th smallest distance is included, so
     * the neighborhood may contain more than numKNN points.
     *
     * NOTE(review): a neighborhood with no transversions yields
     * Double.POSITIVE_INFINITY — callers must tolerate that.
     *
     * @throws StingException if the leaf bin holds fewer than numKNN points
     */
    public double calcNeighborhoodTITV( final VariantDatum variant ) {
        // Grab the subset of points that are approximately near this point
        final VariantDatum[] data = getBin( variant.annotations, root );
        if( data.length < numKNN ) {
            throw new StingException( "Bin is too small. Should be > " + numKNN );
        }

        // Find the numKNN nearest points in the subset
        final double[] originalDistances = calcDistances( variant.annotations, data );
        final double[] distances = originalDistances.clone();
        quickSort( distances, 0, distances.length - 1 ); // right bound is inclusive, so length-1 is correct
        final double minDistance = distances[numKNN - 1]; // distance of the k-th nearest neighbor

        // Tally transitions vs. transversions across the neighborhood
        int numTi = 0;
        int numTv = 0;
        for( int iii = 0; iii < originalDistances.length; iii++ ) {
            if( originalDistances[iii] <= minDistance ) {
                if( data[iii].isTransition ) { numTi++; }
                else { numTv++; }
            }
        }
        return ((double) numTi) / ((double) numTv);
    }

    /**
     * Descends the tree to the leaf bin whose region contains this annotation vector.
     */
    private VariantDatum[] getBin( final double[] variant, final VariantTreeNode node ) {
        if( node.variants != null ) { // leaf nodes hold their data directly
            return node.variants;
        } else {
            if( variant[node.cutDim] < node.cutValue ) {
                return getBin( variant, node.left );
            } else {
                return getBin( variant, node.right );
            }
        }
    }

    /**
     * Squared Euclidean distance from the query vector to each datum's annotation
     * vector. Assumes every annotation vector has the same length as the query.
     */
    private double[] calcDistances( final double[] variant, final VariantDatum[] data ) {
        final double[] distSquared = new double[data.length];
        int iii = 0;
        for( final VariantDatum variantDatum : data ) {
            distSquared[iii] = 0.0;
            int jjj = 0;
            for( final double value : variantDatum.annotations) {
                final double diff = variant[jjj] - value;
                distSquared[iii] += ( diff * diff );
                jjj++;
            }
            iii++;
        }
        return distSquared;
    }

    /**
     * Hoare-style partition around the middle element over the inclusive range
     * [left, right]; returns the first index of the right-hand region.
     */
    public static int partition(final double[] arr, final int left, final int right)
    {
        int i = left, j = right;
        double tmp;
        final double pivot = arr[(left + right) >>> 1]; // unsigned shift avoids int overflow on huge ranges
        while (i <= j) {
            while (arr[i] < pivot) {
                i++;
            }
            while (arr[j] > pivot) {
                j--;
            }
            if (i <= j) {
                tmp = arr[i];
                arr[i] = arr[j];
                arr[j] = tmp;
                i++;
                j--;
            }
        }
        return i;
    }

    /**
     * In-place quicksort of arr over the inclusive index range [left, right].
     */
    public static void quickSort(final double[] arr, final int left, final int right) {
        final int index = partition(arr, left, right);
        if (left < index - 1) {
            quickSort(arr, left, index - 1);
        }
        if (index < right) {
            quickSort(arr, index, right);
        }
    }
}

View File

@ -0,0 +1,104 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Feb 24, 2010
*/
/**
 * One node of the binary space-partition tree used by VariantTree. Interior
 * nodes carry a cut dimension/value plus two children; leaf nodes hold their
 * subset of the data directly in 'variants'.
 */
public class VariantTreeNode {

    public VariantTreeNode left;      // child for annotation values below cutValue (null at leaves)
    public VariantTreeNode right;     // child for annotation values at/above cutValue (null at leaves)
    public int cutDim;                // which annotation dimension this node splits on
    public double cutValue;           // split threshold along cutDim
    public VariantDatum[] variants;   // non-null only at leaves: the bin's data
    private final int minBinSize = 8000; // BUGBUG: must be larger than number of kNN

    public VariantTreeNode() {
        left = null;
        right = null;
        variants = null;
        cutDim = -1;
        cutValue = -1;
    }

    /**
     * Recursively partitions 'data' into overlapping bins.
     *
     * Dimensions are cycled round-robin (cutDim = depth % numAnnotations), and
     * lastCutDepth records the depth of the most recent successful cut. Once a
     * full cycle passes with no successful cut, the node becomes a leaf.
     *
     * Cuts use the 40th/60th percentile pivots, so points lying between the two
     * pivots are copied into BOTH children; that overlap is what makes
     * VariantTree's bin lookups approximately correct near bin boundaries.
     */
    public final void cutData( final VariantDatum[] data, final int depth, final int lastCutDepth, final int numAnnotations ) {
        cutDim = depth % numAnnotations;
        if( depth != lastCutDepth && (cutDim == (lastCutDepth % numAnnotations)) ) { // Base case: we've tried to cut on all the annotations
            variants = data;
            return;
        }

        // Extract this dimension's values, then sort a copy to locate the percentile pivots
        final double[] values = new double[data.length];
        for( int iii = 0; iii < data.length; iii++ ) {
            values[iii] = data[iii].annotations[cutDim];
        }
        final double[] sortedValues = values.clone();
        VariantTree.quickSort( sortedValues, 0, values.length-1 ); // right bound is inclusive, so length-1 is correct
        // NOTE(review): for tiny arrays Math.round(0.60f * length) can equal length and
        // index past the end — presumably data here is always >= minBinSize; verify.
        final int lowPivotIndex = Math.round(0.40f * sortedValues.length);
        final int highPivotIndex = Math.round(0.60f * sortedValues.length);
        final double lowPivot = sortedValues[lowPivotIndex];
        final double highPivot = sortedValues[highPivotIndex];
        cutValue = highPivot;

        // Count how many points would land in each (overlapping) child
        int numLow = 0;
        int numHigh = 0;
        for( int iii = 0; iii < data.length; iii++ ) {
            if( values[iii] < highPivot ) { numLow++; }
            if( values[iii] >= lowPivot ) { numHigh++; }
        }

        // If cutting here makes the bin too small then don't cut
        if( numLow < minBinSize || numHigh < minBinSize || (numLow == numHigh && numLow == data.length) ) {
            // Route all data to the right child and retry on the next dimension;
            // lastCutDepth is NOT advanced, so the all-dimensions-tried base case can still fire.
            // NOTE(review): cutValue becomes the minimum value, so a query strictly below it
            // would descend into the null left child — confirm queries stay within the data's range.
            cutValue = sortedValues[0];
            right = new VariantTreeNode();
            right.cutData(data, depth+1, lastCutDepth, numAnnotations);
        } else {
            // Split the data, duplicating points in [lowPivot, highPivot) into both children
            final VariantDatum[] leftData = new VariantDatum[numLow];
            final VariantDatum[] rightData = new VariantDatum[numHigh];
            int leftIndex = 0;
            int rightIndex = 0;
            for( int iii = 0; iii < data.length; iii++ ) {
                if( values[iii] < highPivot ) { leftData[leftIndex++] = data[iii]; }
                if( values[iii] >= lowPivot ) { rightData[rightIndex++] = data[iii]; }
            }
            left = new VariantTreeNode();
            right = new VariantTreeNode();
            left.cutData(leftData, depth+1, depth, numAnnotations);
            right.cutData(rightData, depth+1, depth, numAnnotations);
        }
    }
}

View File

@ -0,0 +1,65 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broadinstitute.sting.WalkerTest;
import org.junit.Test;
import java.util.*;
import java.io.File;
/**
 * Performance regression tests for the variant recalibration walkers. The
 * recalibrator test consumes the cluster-parameters file produced by the
 * clustering test, handed between test methods via a static map.
 */
public class VariantRecalibrationWalkersPerformanceTest extends WalkerTest {

    static HashMap<String, String> paramsFiles = new HashMap<String, String>();

    @Test
    public void testGenerateVariantClusters() {
        // input VCF -> expected md5 of the walker output
        final HashMap<String, String> expectations = new HashMap<String, String>();
        expectations.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "b39bde091254e90490b0d7bf9f0ef24a" );
        for ( final Map.Entry<String, String> expectation : expectations.entrySet() ) {
            final String vcf = expectation.getKey();
            final String md5 = expectation.getValue();
            final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-R " + oneKGLocation + "reference/human_b36_both.fasta" +
                " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                " -T GenerateVariantClusters" +
                " -B input,VCF," + vcf +
                " -nG 6" +
                " -nI 5" +
                " --ignore_filter GATK_STANDARD" +
                " -an QD -an HRun -an SB" +
                " -clusterFile /dev/null",
                0, // just one output file
                new ArrayList<String>(0));
            final List<File> result = executeTest("testGenerateVariantClusters", spec).getFirst();
            // Stash the produced cluster-parameters file for testVariantRecalibrator to pick up
            paramsFiles.put(vcf, result.get(0).getAbsolutePath());
        }
    }

    @Test
    public void testVariantRecalibrator() {
        final HashMap<String, String> expectations = new HashMap<String, String>();
        expectations.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "412bdb2eb4ca8f7ee9dfb39cda676c95" );
        for ( final Map.Entry<String, String> expectation : expectations.entrySet() ) {
            final String vcf = expectation.getKey();
            final String md5 = expectation.getValue();
            // Only run when the clustering test has already produced a params file for this VCF
            final String paramsFile = paramsFiles.get(vcf);
            System.out.printf("PARAMS FOR %s is %s%n", vcf, paramsFile);
            if ( paramsFile != null ) {
                final File file = createTempFile("cluster",".vcf");
                final WalkerTestSpec spec = new WalkerTestSpec(
                    "-R " + oneKGLocation + "reference/human_b36_both.fasta" +
                    " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                    " -T VariantRecalibrator" +
                    " -B input,VCF," + vcf +
                    " --ignore_filter GATK_STANDARD" +
                    " -output /dev/null" +
                    " -clusterFile " + validationDataLocation + "clusterFile",
                    0,
                    new ArrayList<String>(0));
                executeTest("testVariantRecalibrator", spec);
            }
        }
    }
}