Adding the variantrecalibration classes.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3459 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2014837f8a
commit
522dd7a5b2
|
|
@ -0,0 +1,237 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Feb 11, 2010
|
||||
*
|
||||
* @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
|
||||
*/
|
||||
|
||||
public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||
private boolean IGNORE_ALL_INPUT_FILTERS = false;
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||
private String[] IGNORE_INPUT_FILTERS = null;
|
||||
@Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
|
||||
private String[] USE_ANNOTATIONS = null;
|
||||
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||
private String CLUSTER_FILENAME = "optimizer.cluster";
|
||||
@Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used when clustering", required=false)
|
||||
private int NUM_GAUSSIANS = 6;
|
||||
@Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed when clustering", required=false)
|
||||
private int NUM_ITERATIONS = 10;
|
||||
@Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. It can be used to prevent overfitting.", required=false)
|
||||
private int MIN_VAR_IN_CLUSTER = 0;
|
||||
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||
private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
|
||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
||||
private String PATH_TO_RESOURCES = "R/";
|
||||
@Argument(fullName="weightKnowns", shortName="weightKnowns", doc="The weight for known variants during clustering", required=false)
|
||||
private double WEIGHT_KNOWNS = 8.0;
|
||||
@Argument(fullName="weightHapMap", shortName="weightHapMap", doc="The weight for known HapMap variants during clustering", required=false)
|
||||
private double WEIGHT_HAPMAP = 120.0;
|
||||
@Argument(fullName="weight1000Genomes", shortName="weight1000Genomes", doc="The weight for known 1000 Genomes Project variants during clustering", required=false)
|
||||
private double WEIGHT_1000GENOMES = 12.0;
|
||||
@Argument(fullName="weightMQ1", shortName="weightMQ1", doc="The weight for MQ1 dbSNP variants during clustering", required=false)
|
||||
private double WEIGHT_MQ1 = 10.0;
|
||||
|
||||
//@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
|
||||
//private int NUM_KNN = 2000;
|
||||
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
|
||||
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private ExpandingArrayList<String> annotationKeys;
|
||||
private Set<String> ignoreInputFilterSet = null;
|
||||
private int maxAC = 0;
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
annotationKeys = new ExpandingArrayList<String>(Arrays.asList(USE_ANNOTATIONS));
|
||||
|
||||
if( IGNORE_INPUT_FILTERS != null ) {
|
||||
ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
|
||||
}
|
||||
|
||||
boolean foundDBSNP = false;
|
||||
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
|
||||
for( final ReferenceOrderedDataSource source : dataSources ) {
|
||||
final RMDTrack rod = source.getReferenceOrderedData();
|
||||
if ( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
|
||||
foundDBSNP = true;
|
||||
}
|
||||
}
|
||||
|
||||
if(!foundDBSNP) {
|
||||
throw new StingException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites.");
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
|
||||
|
||||
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
|
||||
return mapList;
|
||||
}
|
||||
|
||||
final double annotationValues[] = new double[annotationKeys.size()];
|
||||
|
||||
// todo -- do we really need to support multiple tracks -- logic is cleaner without this case -- what's the use case?
|
||||
for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
|
||||
if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
|
||||
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
|
||||
int iii = 0;
|
||||
for( final String key : annotationKeys ) {
|
||||
annotationValues[iii++] = VariantGaussianMixtureModel.decodeAnnotation( key, vc );
|
||||
}
|
||||
|
||||
final VariantDatum variantDatum = new VariantDatum();
|
||||
variantDatum.annotations = annotationValues;
|
||||
variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
|
||||
variantDatum.alleleCount = vc.getChromosomeCount(vc.getAlternateAllele(0)); // BUGBUG: assumes file has genotypes
|
||||
if( variantDatum.alleleCount > maxAC ) {
|
||||
maxAC = variantDatum.alleleCount;
|
||||
}
|
||||
|
||||
variantDatum.isKnown = false;
|
||||
variantDatum.weight = 1.0;
|
||||
|
||||
final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
|
||||
if( dbsnp != null ) {
|
||||
variantDatum.isKnown = true;
|
||||
variantDatum.weight = WEIGHT_KNOWNS;
|
||||
if( DbSNPHelper.isHapmap( dbsnp ) ) { variantDatum.weight = WEIGHT_HAPMAP; }
|
||||
else if( DbSNPHelper.is1000genomes( dbsnp ) ) { variantDatum.weight = WEIGHT_1000GENOMES; }
|
||||
else if( DbSNPHelper.isMQ1( dbsnp ) ) { variantDatum.weight = WEIGHT_MQ1; }
|
||||
}
|
||||
|
||||
mapList.add( variantDatum );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return mapList;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduceInit() {
|
||||
return new ExpandingArrayList<VariantDatum>();
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
reduceSum.addAll( mapValue );
|
||||
return reduceSum;
|
||||
}
|
||||
|
||||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
|
||||
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
|
||||
reduceSum.clear(); // Don't need this ever again, clean up some memory
|
||||
|
||||
logger.info( "There are " + dataManager.numVariants + " variants and " + dataManager.numAnnotations + " annotations." );
|
||||
logger.info( "The annotations are: " + annotationKeys );
|
||||
|
||||
dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]
|
||||
|
||||
// Create either the Gaussian Mixture Model or the Nearest Neighbors model and run it
|
||||
VariantGaussianMixtureModel theModel;
|
||||
switch (OPTIMIZATION_MODEL) {
|
||||
case GAUSSIAN_MIXTURE_MODEL:
|
||||
theModel = new VariantGaussianMixtureModel( dataManager, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER, maxAC );
|
||||
break;
|
||||
//case K_NEAREST_NEIGHBORS:
|
||||
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
|
||||
// break;
|
||||
default:
|
||||
throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
|
||||
}
|
||||
|
||||
theModel.run( CLUSTER_FILENAME );
|
||||
theModel.outputClusterReports( CLUSTER_FILENAME );
|
||||
|
||||
for( final String annotation : annotationKeys ) {
|
||||
// Execute Rscript command to plot the optimization curve
|
||||
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
||||
final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_ClusterReport.R" + " " + CLUSTER_FILENAME + "." + annotation + ".dat " + annotation;
|
||||
System.out.println( rScriptCommandLine );
|
||||
|
||||
// Execute the RScript command to plot the table of truth values
|
||||
try {
|
||||
final Process p = Runtime.getRuntime().exec( rScriptCommandLine );
|
||||
p.waitFor();
|
||||
} catch (InterruptedException e) {
|
||||
throw new StingException(e.getMessage());
|
||||
} catch ( IOException e ) {
|
||||
throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Mar 2, 2010
|
||||
*/
|
||||
|
||||
/**
 * Contract for models that learn variant clusters from annotation data.
 *
 * NOTE(review): VariantGaussianMixtureModel.createClusters in this same commit takes an
 * additional boolean useTITV parameter and therefore does not implement this exact
 * signature -- confirm whether this interface is intended to be updated or retired.
 */
public interface VariantClusteringModel extends VariantOptimizationInterface {
    /**
     * Learn clusters with indices in [startCluster, stopCluster) from the supplied variant
     * data, associated with the given cluster file name.
     *
     * @param data             the variant data to cluster
     * @param startCluster     index of the first cluster to create (inclusive)
     * @param stopCluster      index one past the last cluster to create (exclusive -- TODO confirm against implementations)
     * @param clusterFilename  the cluster file this run reads from or writes to
     */
    public void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFilename );
    //public void applyClusters( final VariantDatum[] data, final String outputPrefix );
}
|
||||
|
|
@ -0,0 +1,140 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 26, 2010
|
||||
*/
|
||||
|
||||
public class VariantDataManager {
|
||||
|
||||
protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
|
||||
|
||||
public final VariantDatum[] data;
|
||||
public final int numVariants;
|
||||
public final int numAnnotations;
|
||||
public final double[] meanVector;
|
||||
public final double[] varianceVector; // This is really the standard deviation
|
||||
public boolean isNormalized;
|
||||
public final ExpandingArrayList<String> annotationKeys;
|
||||
|
||||
public VariantDataManager( final ExpandingArrayList<VariantDatum> dataList, final ExpandingArrayList<String> _annotationKeys ) {
|
||||
numVariants = dataList.size();
|
||||
data = dataList.toArray( new VariantDatum[numVariants] );
|
||||
if( numVariants <= 0 ) {
|
||||
throw new StingException( "There are zero variants! (or possibly a problem with integer overflow)" );
|
||||
}
|
||||
if( _annotationKeys == null ) {
|
||||
numAnnotations = 0;
|
||||
meanVector = null;
|
||||
varianceVector = null;
|
||||
} else {
|
||||
numAnnotations = _annotationKeys.size();
|
||||
if( numAnnotations <= 0 ) {
|
||||
throw new StingException( "There are zero annotations! (or possibly a problem with integer overflow)" );
|
||||
}
|
||||
meanVector = new double[numAnnotations];
|
||||
varianceVector = new double[numAnnotations];
|
||||
isNormalized = false;
|
||||
}
|
||||
annotationKeys = _annotationKeys;
|
||||
}
|
||||
|
||||
public VariantDataManager( final ExpandingArrayList<String> annotationLines ) {
|
||||
data = null;
|
||||
numVariants = 0;
|
||||
numAnnotations = annotationLines.size();
|
||||
meanVector = new double[numAnnotations];
|
||||
varianceVector = new double[numAnnotations];
|
||||
isNormalized = true;
|
||||
annotationKeys = new ExpandingArrayList<String>();
|
||||
|
||||
int jjj = 0;
|
||||
for( final String line : annotationLines ) {
|
||||
final String[] vals = line.split(",");
|
||||
annotationKeys.add(vals[1]);
|
||||
meanVector[jjj] = Double.parseDouble(vals[2]);
|
||||
varianceVector[jjj] = Double.parseDouble(vals[3]);
|
||||
jjj++;
|
||||
}
|
||||
}
|
||||
|
||||
public void normalizeData() {
|
||||
boolean foundZeroVarianceAnnotation = false;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
final double theMean = mean(data, jjj);
|
||||
final double theSTD = standardDeviation(data, theMean, jjj);
|
||||
logger.info( annotationKeys.get(jjj) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) );
|
||||
if( theSTD < 1E-8 ) {
|
||||
foundZeroVarianceAnnotation = true;
|
||||
logger.warn("Zero variance is a problem: standard deviation = " + theSTD + " User must -exclude annotations with zero variance. Annotation = " + (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)));
|
||||
} else if( theSTD < 1E-2 ) {
|
||||
logger.warn("Warning! Tiny variance. It is strongly recommended that you -exclude " + annotationKeys.get(jjj));
|
||||
}
|
||||
meanVector[jjj] = theMean;
|
||||
varianceVector[jjj] = theSTD;
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
data[iii].annotations[jjj] = ( data[iii].annotations[jjj] - theMean ) / theSTD;
|
||||
}
|
||||
}
|
||||
isNormalized = true; // Each data point is now [ (x - mean) / standard deviation ]
|
||||
if( foundZeroVarianceAnnotation ) {
|
||||
throw new StingException("Found annotations with zero variance. They must be excluded before proceeding.");
|
||||
}
|
||||
}
|
||||
|
||||
private static double mean( final VariantDatum[] data, final int index ) {
|
||||
double sum = 0.0;
|
||||
final int numVars = data.length;
|
||||
for( int iii = 0; iii < numVars; iii++ ) {
|
||||
sum += (data[iii].annotations[index] / ((double) numVars));
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
private static double standardDeviation( final VariantDatum[] data, final double mean, final int index ) {
|
||||
double sum = 0.0;
|
||||
final int numVars = data.length;
|
||||
for( int iii = 0; iii < numVars; iii++ ) {
|
||||
sum += ( ((data[iii].annotations[index] - mean)*(data[iii].annotations[index] - mean)) / ((double) numVars));
|
||||
}
|
||||
return Math.sqrt( sum );
|
||||
}
|
||||
|
||||
public void printClusterFileHeader( PrintStream outputFile ) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
outputFile.println("@!ANNOTATION," + annotationKeys.get(jjj) + "," + meanVector[jjj] + "," + varianceVector[jjj]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 24, 2010
|
||||
*/
|
||||
|
||||
/**
 * Mutable value holder for one variant's clustering features.
 * Populated field-by-field in GenerateVariantClustersWalker.map().
 */
public class VariantDatum {
    public double[] annotations;  // one value per requested annotation; rescaled in place by VariantDataManager.normalizeData()
    public boolean isTransition;  // true when the SNP substitution type is a transition
    public boolean isKnown;       // true when the site overlaps a dbSNP record
    public double qual;           // variant quality -- not assigned anywhere in this commit; TODO confirm intended use
    public double weight;         // clustering weight: 1.0 for novels, boosted for dbSNP/HapMap/1000G/MQ1 sites
    public int alleleCount;       // alternate allele chromosome count (assumes the input VCF has genotypes)
}
|
||||
|
|
@ -0,0 +1,730 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import Jama.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.Random;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 26, 2010
|
||||
*/
|
||||
|
||||
public final class VariantGaussianMixtureModel extends VariantOptimizationModel {
|
||||
|
||||
protected final static Logger logger = Logger.getLogger(VariantGaussianMixtureModel.class);
|
||||
|
||||
public final VariantDataManager dataManager;
|
||||
private final int numGaussians;
|
||||
private final int numIterations;
|
||||
private final static long RANDOM_SEED = 91801305;
|
||||
private final static Random rand = new Random( RANDOM_SEED );
|
||||
private final double MIN_PROB = 1E-7;
|
||||
private final double MIN_SIGMA = 1E-5;
|
||||
private final double MIN_DETERMINANT = 1E-5;
|
||||
|
||||
private final double[][] mu; // The means for each cluster
|
||||
private final Matrix[] sigma; // The covariance matrix for each cluster
|
||||
private final Matrix[] sigmaInverse;
|
||||
private final double[] pCluster;
|
||||
private final double[] determinant;
|
||||
private final double[] alleleCountFactorArray;
|
||||
private final int minVarInCluster;
|
||||
|
||||
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
|
||||
private static final Pattern ALLELECOUNT_PATTERN = Pattern.compile("^@!ALLELECOUNT.*");
|
||||
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
|
||||
|
||||
/**
 * Constructs the model for the clustering (training) pass.
 *
 * @param _dataManager    the normalized variant data to cluster
 * @param _numGaussians   number of mixture components
 * @param _numIterations  number of EM iterations to run
 * @param _minVarInCluster minimum variants for a cluster to be considered valid
 * @param maxAC           largest alternate allele count seen in the data
 */
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final int _numGaussians, final int _numIterations, final int _minVarInCluster, final int maxAC ) {
    dataManager = _dataManager;
    numGaussians = _numGaussians;
    numIterations = _numIterations;

    mu = new double[numGaussians][];
    sigma = new Matrix[numGaussians];
    determinant = new double[numGaussians];
    pCluster = new double[numGaussians];
    alleleCountFactorArray = new double[maxAC + 1]; // indexed directly by allele count, hence size maxAC+1
    minVarInCluster = _minVarInCluster;
    sigmaInverse = null; // This field isn't used during VariantOptimizer pass
}
|
||||
|
||||
/**
 * Constructs the model for the apply pass by parsing a previously written cluster file.
 *
 * Recognized line types: "@!ANNOTATION,..." (annotation names and normalization stats,
 * handed to VariantDataManager), "@!ALLELECOUNT,ac,factor" (allele-count prior), and
 * "@!CLUSTER,p,mu...,sigma..." (one mixture component per line). Any other line is a
 * format error.
 *
 * @param _targetTITV          target Ti/Tv ratio, forwarded to the superclass
 * @param clusterFileName      the cluster file to load
 * @param backOffGaussianFactor multiplier applied to every covariance entry when loading
 * @throws StingException on a missing or malformed cluster file
 */
public VariantGaussianMixtureModel( final double _targetTITV, final String clusterFileName, final double backOffGaussianFactor ) {
    super( _targetTITV );
    final ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
    final ExpandingArrayList<String> alleleCountLines = new ExpandingArrayList<String>();
    final ExpandingArrayList<String> clusterLines = new ExpandingArrayList<String>();

    // First pass: bucket every line of the cluster file by its record type
    try {
        for ( final String line : new XReadLines(new File( clusterFileName )) ) {
            if( ANNOTATION_PATTERN.matcher(line).matches() ) {
                annotationLines.add(line);
            } else if( ALLELECOUNT_PATTERN.matcher(line).matches() ) {
                alleleCountLines.add(line);
            } else if( CLUSTER_PATTERN.matcher(line).matches() ) {
                clusterLines.add(line);
            } else {
                throw new StingException("Malformed input file: " + clusterFileName);
            }
        }
    } catch ( FileNotFoundException e ) {
        throw new StingException("Can not find input file: " + clusterFileName);
    }

    dataManager = new VariantDataManager( annotationLines );
    // Several of the clustering parameters aren't used the second time around in ApplyVariantClusters
    numIterations = 0;
    minVarInCluster = 0;

    // BUGBUG: move this parsing out of the constructor
    numGaussians = clusterLines.size();
    mu = new double[numGaussians][dataManager.numAnnotations];
    double sigmaVals[][][] = new double[numGaussians][dataManager.numAnnotations][dataManager.numAnnotations];
    sigma = new Matrix[numGaussians];
    sigmaInverse = new Matrix[numGaussians];
    pCluster = new double[numGaussians];
    determinant = new double[numGaussians];

    // "@!ALLELECOUNT,ac,factor": prior factor indexed by allele count
    alleleCountFactorArray = new double[alleleCountLines.size() + 1];
    for( final String line : alleleCountLines ) {
        final String[] vals = line.split(",");
        alleleCountFactorArray[Integer.parseInt(vals[1])] = Double.parseDouble(vals[2]);
    }

    // "@!CLUSTER,p,mu[0..numAnn),sigma row-major": one mixture component per line
    int kkk = 0;
    for( final String line : clusterLines ) {
        final String[] vals = line.split(",");
        pCluster[kkk] = Double.parseDouble(vals[1]); // BUGBUG: #define these magic index numbers, very easy to make a mistake here
        for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
            mu[kkk][jjj] = Double.parseDouble(vals[2+jjj]);
            for( int ppp = 0; ppp < dataManager.numAnnotations; ppp++ ) {
                // covariance entries follow the means; row-major layout, scaled by the back-off factor
                sigmaVals[kkk][jjj][ppp] = Double.parseDouble(vals[2+dataManager.numAnnotations+(jjj*dataManager.numAnnotations)+ppp]) * backOffGaussianFactor;
            }
        }

        sigma[kkk] = new Matrix(sigmaVals[kkk]);
        sigmaInverse[kkk] = sigma[kkk].inverse(); // Precompute all the inverses and determinants for use later
        determinant[kkk] = sigma[kkk].det();
        kkk++;
    }

    logger.info("Found " + numGaussians + " clusters and using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys);
}
|
||||
|
||||
/**
 * Entry point for the training pass: builds the allele-count prior and then clusters
 * all variants (knowns have already been weighted up relative to novels by the walker).
 *
 * @param clusterFileName the cluster file to write the learned parameters to
 */
public final void run( final String clusterFileName ) {

    // Initialize the Allele Count prior
    generateAlleleCountPrior();

    // Simply cluster with all the variants. The knowns have been given more weight than the novels
    logger.info("Clustering with " + dataManager.data.length + " variants.");
    createClusters( dataManager.data, 0, numGaussians, clusterFileName, false );
}
|
||||
|
||||
/**
 * Fills alleleCountFactorArray with a prior factor per allele count: the ratio of the
 * expected AC frequency under a 1/AC (neutral-ish) distribution to the frequency
 * actually observed in the data. Index 0 is never written (allele counts start at 1).
 *
 * NOTE(review): if an allele count in [1, maxAC] is never observed, acActual is 0 and
 * the corresponding factor becomes Infinity -- confirm downstream code tolerates this.
 */
private void generateAlleleCountPrior() {

    final double[] acExpectation = new double[alleleCountFactorArray.length];
    final double[] acActual = new double[alleleCountFactorArray.length];
    final int[] alleleCount = new int[alleleCountFactorArray.length];

    // Expected distribution proportional to 1/AC, normalized below
    double sumExpectation = 0.0;
    for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
        acExpectation[iii] = 1.0 / ((double) iii);
        sumExpectation += acExpectation[iii];
    }
    for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
        acExpectation[iii] /= sumExpectation; // Turn acExpectation into a probability distribution
        alleleCount[iii] = 0;
    }
    // Histogram of observed allele counts
    for( final VariantDatum datum : dataManager.data ) {
        alleleCount[datum.alleleCount]++;
    }
    for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
        acActual[iii] = ((double)alleleCount[iii]) / ((double)dataManager.data.length); // Turn acActual into a probability distribution
    }
    for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
        alleleCountFactorArray[iii] = acExpectation[iii] / acActual[iii]; // Prior is (expected / observed)
    }
}
|
||||
|
||||
public final double getAlleleCountPrior( final int alleleCount ) {
|
||||
return alleleCountFactorArray[alleleCount];
|
||||
}
|
||||
|
||||
    /**
     * Trains the mixture model over the given variants via expectation-maximization.
     * Clusters in [startCluster, stopCluster) are seeded with randomly chosen means and random
     * symmetric positive-definite covariances, then refined for numIterations EM rounds.
     * Cluster parameters are written to "clusterFileName.<iteration>" after every round and to
     * clusterFileName itself at the end.
     *
     * @param data            the (annotation-normalized) variants to cluster
     * @param startCluster    first cluster index to (re)initialize and train, inclusive
     * @param stopCluster     last cluster index, exclusive
     * @param clusterFileName base file name for intermediate and final cluster files
     * @param useTITV         not referenced anywhere in this method -- NOTE(review): confirm
     *                        whether this parameter can be removed
     */
    public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFileName, final boolean useTITV ) {

        final int numVariants = data.length;
        final int numAnnotations = data[0].annotations.length;

        final double[][] pVarInCluster = new double[numGaussians][numVariants]; // Probability that the variant is in that cluster = simply evaluate the multivariate Gaussian

        // loop control variables:
        // iii - loop over data points
        // jjj - loop over annotations (features)
        // ppp - loop over annotations again (full rank covariance matrix)
        // kkk - loop over clusters
        // ttt - loop over EM iterations

        // Set up the initial random Gaussians
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            pCluster[kkk] = 1.0 / ((double) (stopCluster - startCluster)); // uniform initial mixing weights
            mu[kkk] = data[rand.nextInt(numVariants)].annotations; // seed each mean with a random data point
            final double[][] randSigma = new double[numAnnotations][numAnnotations];
            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
                    randSigma[ppp][jjj] = 0.5 + 0.5 * rand.nextDouble(); // data has been normalized so sigmas are centered at 1.0
                    if(jjj != ppp) { randSigma[jjj][ppp] = 0.0; } // Sigma is a symmetric, positive-definite matrix
                }
            }
            // L * L^T of a random lower-triangular matrix guarantees a symmetric positive-definite sigma
            Matrix tmp = new Matrix(randSigma);
            tmp = tmp.times(tmp.transpose());
            sigma[kkk] = tmp;
            determinant[kkk] = sigma[kkk].det();
        }

        // The EM loop
        for( int ttt = 0; ttt < numIterations; ttt++ ) {

            /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
            // Expectation Step (calculate the probability that each data point is in each cluster)
            /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
            evaluateGaussians( data, pVarInCluster, startCluster, stopCluster );

            /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
            // Maximization Step (move the clusters to maximize the sum probability of each data point)
            /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
            maximizeGaussians( data, pVarInCluster, startCluster, stopCluster );

            /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
            // Output cluster parameters at each iteration
            /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
            printClusterParameters( clusterFileName + "." + (ttt+1) );

            logger.info("Finished iteration " + (ttt+1) );
        }

        /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        // Output the final cluster parameters
        /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        printClusterParameters( clusterFileName );
    }
|
||||
|
||||
private void printClusterParameters( final String clusterFileName ) {
|
||||
try {
|
||||
final PrintStream outputFile = new PrintStream( clusterFileName );
|
||||
dataManager.printClusterFileHeader( outputFile );
|
||||
for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) {
|
||||
outputFile.print("@!ALLELECOUNT,");
|
||||
outputFile.println(iii + "," + alleleCountFactorArray[iii]);
|
||||
}
|
||||
|
||||
final int numAnnotations = mu[0].length;
|
||||
final int numVariants = dataManager.numVariants;
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
if( pCluster[kkk] * numVariants > minVarInCluster ) {
|
||||
final double sigmaVals[][] = sigma[kkk].getArray();
|
||||
outputFile.print("@!CLUSTER,");
|
||||
outputFile.print(pCluster[kkk] + ",");
|
||||
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
outputFile.print(mu[kkk][jjj] + ",");
|
||||
}
|
||||
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
for(int ppp = 0; ppp < numAnnotations; ppp++ ) {
|
||||
outputFile.print(sigmaVals[jjj][ppp] + ",");
|
||||
}
|
||||
}
|
||||
outputFile.println(-1);
|
||||
}
|
||||
}
|
||||
outputFile.close();
|
||||
} catch (Exception e) {
|
||||
throw new StingException( "Unable to create output file: " + clusterFileName );
|
||||
}
|
||||
}
|
||||
|
||||
public static double decodeAnnotation( final String annotationKey, final VariantContext vc ) {
|
||||
double value;
|
||||
//if( annotationKey.equals("AB") && !vc.getAttributes().containsKey(annotationKey) ) {
|
||||
// value = (0.5 - 0.005) + (0.01 * rand.nextDouble()); // HomVar calls don't have an allele balance
|
||||
//}
|
||||
if( annotationKey.equals("QUAL") ) {
|
||||
value = vc.getPhredScaledQual();
|
||||
} else {
|
||||
try {
|
||||
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
|
||||
} catch( Exception e ) {
|
||||
throw new StingException("No double value detected for annotation = " + annotationKey +
|
||||
" in variant at " + vc.getLocation() + ", reported annotation value = " + vc.getAttribute( annotationKey ) );
|
||||
}
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
public final double evaluateVariant( final VariantContext vc ) {
|
||||
final double[] pVarInCluster = new double[numGaussians];
|
||||
final double[] annotations = new double[dataManager.numAnnotations];
|
||||
|
||||
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
|
||||
final double value = decodeAnnotation( dataManager.annotationKeys.get(jjj), vc );
|
||||
annotations[jjj] = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj];
|
||||
}
|
||||
|
||||
evaluateGaussiansForSingleVariant( annotations, pVarInCluster );
|
||||
|
||||
double sum = 0.0;
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
sum += pVarInCluster[kkk]; // * clusterTruePositiveRate[kkk];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
public final void outputClusterReports( final String outputPrefix ) {
|
||||
final double STD_STEP = 0.2;
|
||||
final double MAX_STD = 4.0;
|
||||
final double MIN_STD = -4.0;
|
||||
final int NUM_BINS = (int)Math.floor((Math.abs(MIN_STD) + Math.abs(MAX_STD)) / STD_STEP);
|
||||
final int numAnnotations = dataManager.numAnnotations;
|
||||
int totalCountsKnown = 0;
|
||||
int totalCountsNovel = 0;
|
||||
|
||||
final int counts[][][] = new int[numAnnotations][NUM_BINS][2];
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||
counts[jjj][iii][0] = 0;
|
||||
counts[jjj][iii][1] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
for( VariantDatum datum : dataManager.data ) {
|
||||
final int isKnown = ( datum.isKnown ? 1 : 0 );
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
int histBin = (int)Math.round((datum.annotations[jjj]-MIN_STD) * (1.0 / STD_STEP));
|
||||
if(histBin < 0) { histBin = 0; }
|
||||
if(histBin > NUM_BINS-1) { histBin = NUM_BINS-1; }
|
||||
if(histBin >= 0 && histBin <= NUM_BINS-1) {
|
||||
counts[jjj][histBin][isKnown]++;
|
||||
}
|
||||
}
|
||||
if( isKnown == 1 ) { totalCountsKnown++; }
|
||||
else { totalCountsNovel++; }
|
||||
}
|
||||
|
||||
int annIndex = 0;
|
||||
for( final String annotation : dataManager.annotationKeys ) {
|
||||
PrintStream outputFile;
|
||||
try {
|
||||
outputFile = new PrintStream( outputPrefix + "." + annotation + ".dat" );
|
||||
} catch (Exception e) {
|
||||
throw new StingException( "Unable to create output file: " + outputPrefix + ".dat" );
|
||||
}
|
||||
|
||||
outputFile.println("annotationValue,knownDist,novelDist");
|
||||
|
||||
for( int iii = 0; iii < NUM_BINS; iii++ ) {
|
||||
final double annotationValue = (((double)iii * STD_STEP)+MIN_STD) * dataManager.varianceVector[annIndex] + dataManager.meanVector[annIndex];
|
||||
outputFile.println( annotationValue + "," + ( ((double)counts[annIndex][iii][1])/((double)totalCountsKnown) ) +
|
||||
"," + ( ((double)counts[annIndex][iii][0])/((double)totalCountsNovel) ));
|
||||
}
|
||||
|
||||
annIndex++;
|
||||
}
|
||||
|
||||
// BUGBUG: next output the actual cluster on top by integrating out every other annotation
|
||||
}
|
||||
|
||||
    /**
     * Sweeps a quality-score cutoff from MAX_QUAL down to ~0 and reports, at each cutoff, the
     * cumulative number of known/novel variants kept and their Ti/Tv ratios, writing the full
     * table to "<outputPrefix>.dat". Afterwards it scans back up the table and logs the cutoffs
     * at which the novel Ti/Tv first reaches 90%, 95%, 98%, and 100% of the target Ti/Tv. If
     * desiredNumVariants is non-zero it also logs the first cutoff keeping at least that many
     * variants.
     *
     * @param data               the scored variants (qual, isKnown, isTransition are read)
     * @param outputPrefix       prefix for the "<outputPrefix>.dat" table
     * @param desiredNumVariants if non-zero, log the first cutoff that keeps at least this many
     */
    public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, final int desiredNumVariants ) {

        final int numVariants = data.length;
        final boolean[] markedVariant = new boolean[numVariants];

        final double MAX_QUAL = 100.0;
        final double QUAL_STEP = 0.1;
        final int NUM_BINS = (int) ((MAX_QUAL / QUAL_STEP) + 1);

        final int numKnownAtCut[] = new int[NUM_BINS];
        final int numNovelAtCut[] = new int[NUM_BINS];
        final double knownTiTvAtCut[] = new double[NUM_BINS];
        final double novelTiTvAtCut[] = new double[NUM_BINS];
        final double theCut[] = new double[NUM_BINS];

        for( int iii = 0; iii < numVariants; iii++ ) {
            markedVariant[iii] = false;
        }

        PrintStream outputFile;
        try {
            outputFile = new PrintStream( outputPrefix + ".dat" );
        } catch (Exception e) {
            throw new StingException( "Unable to create output file: " + outputPrefix + ".dat" );
        }

        int numKnown = 0;
        int numNovel = 0;
        int numKnownTi = 0;
        int numKnownTv = 0;
        int numNovelTi = 0;
        int numNovelTv = 0;
        boolean foundDesiredNumVariants = false;
        int jjj = 0;
        outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV");
        for( double qCut = MAX_QUAL; qCut >= -0.001; qCut -= QUAL_STEP ) {
            // Counts are cumulative: each variant is marked the first time its qual clears a cutoff,
            // so at every row the totals cover all variants with qual >= qCut
            for( int iii = 0; iii < numVariants; iii++ ) {
                if( !markedVariant[iii] ) {
                    if( data[iii].qual >= qCut ) {
                        markedVariant[iii] = true;
                        if( data[iii].isKnown ) { // known
                            numKnown++;
                            if( data[iii].isTransition ) { // transition
                                numKnownTi++;
                            } else { // transversion
                                numKnownTv++;
                            }
                        } else { // novel
                            numNovel++;
                            if( data[iii].isTransition ) { // transition
                                numNovelTi++;
                            } else { // transversion
                                numNovelTv++;
                            }
                        }
                    }
                }
            }
            if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) {
                logger.info( "Keeping variants with QUAL >= " + String.format("%.1f",qCut) + " results in a filtered set with: " );
                logger.info("\t" + numKnown + " known variants");
                logger.info("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)");
                logger.info("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv)));
                logger.info("\t" + String.format("%.4f novel Ti/Tv ratio", ((double)numNovelTi) / ((double)numNovelTv)));
                foundDesiredNumVariants = true;
            }
            outputFile.println( qCut + "," + numKnown + "," + numNovel + "," +
                    ( numKnownTi == 0 || numKnownTv == 0 ? "NaN" : ( ((double)numKnownTi) / ((double)numKnownTv) ) ) + "," +
                    ( numNovelTi == 0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) ) ));

            numKnownAtCut[jjj] = numKnown;
            numNovelAtCut[jjj] = numNovel;
            knownTiTvAtCut[jjj] = ( numKnownTi == 0 || numKnownTv == 0 ? 0.0 : ( ((double)numKnownTi) / ((double)numKnownTv) ) );
            novelTiTvAtCut[jjj] = ( numNovelTi == 0 || numNovelTv == 0 ? 0.0 : ( ((double)numNovelTi) / ((double)numNovelTv) ) );
            theCut[jjj] = qCut;
            jjj++;
        }

        // loop back through the data points looking for appropriate places to cut the data to get the target novel titv ratio
        // (index NUM_BINS-1 is the lowest cutoff, so this walks from permissive toward strict cutoffs)
        int checkQuantile = 0;
        for( jjj = NUM_BINS-1; jjj >= 0; jjj-- ) {
            boolean foundCut = false;
            if( checkQuantile == 0 ) {
                if( novelTiTvAtCut[jjj] >= 0.9 * targetTITV ) {
                    foundCut = true;
                    checkQuantile++;
                }
            } else if( checkQuantile == 1 ) {
                if( novelTiTvAtCut[jjj] >= 0.95 * targetTITV ) {
                    foundCut = true;
                    checkQuantile++;
                }
            } else if( checkQuantile == 2 ) {
                if( novelTiTvAtCut[jjj] >= 0.98 * targetTITV ) {
                    foundCut = true;
                    checkQuantile++;
                }
            } else if( checkQuantile == 3 ) {
                if( novelTiTvAtCut[jjj] >= targetTITV ) {
                    foundCut = true;
                    checkQuantile++;
                }
            } else if( checkQuantile == 4 ) {
                break; // break out
            }

            if( foundCut ) {
                logger.info( "Keeping variants with QUAL >= " + String.format("%.1f",theCut[jjj]) + " results in a filtered set with: " );
                logger.info("\t" + numKnownAtCut[jjj] + " known variants");
                logger.info("\t" + numNovelAtCut[jjj] + " novel variants, (dbSNP rate = " +
                        String.format("%.2f",((double) numKnownAtCut[jjj] * 100.0) / ((double) numKnownAtCut[jjj] + numNovelAtCut[jjj]) ) + "%)");
                logger.info("\t" + String.format("%.4f known Ti/Tv ratio", knownTiTvAtCut[jjj]));
                logger.info("\t" + String.format("%.4f novel Ti/Tv ratio", novelTiTvAtCut[jjj]));
            }
        }

        outputFile.close();
    }
|
||||
|
||||
|
||||
    /**
     * Expectation step of EM: for every variant, evaluates each multivariate Gaussian and fills
     * pVarInCluster[kkk][iii] with cluster kkk's responsibility for variant iii, normalized over
     * clusters and then scaled by the variant's training weight. Logs the mean per-variant
     * explained likelihood. Throws StingException when numerical instability is detected
     * (near-singular or non-positive-definite covariance, or probability > 1).
     *
     * @param data          the normalized variant data
     * @param pVarInCluster output matrix of responsibilities, indexed [cluster][variant]
     * @param startCluster  first cluster index, inclusive
     * @param stopCluster   last cluster index, exclusive
     */
    private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {

        final int numAnnotations = data[0].annotations.length;
        double likelihood = 0.0;
        final double sigmaVals[][][] = new double[numGaussians][][];
        final double denom[] = new double[numGaussians];
        // Precompute each cluster's inverse covariance and Gaussian normalization constant
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            sigmaVals[kkk] = sigma[kkk].inverse().getArray();
            // NOTE(review): 3.14159 is a truncated pi -- Math.PI would be more precise, but the
            // constant largely cancels in the per-variant normalization below
            denom[kkk] = Math.pow(2.0 * 3.14159, ((double)numAnnotations) / 2.0) * Math.pow(Math.abs(determinant[kkk]), 0.5);
        }
        final double mult[] = new double[numAnnotations];
        for( int iii = 0; iii < data.length; iii++ ) {
            double sumProb = 0.0;
            for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
                // Mahalanobis distance: (x - mu)^T * Sigma^-1 * (x - mu), computed in two passes
                double sum = 0.0;
                for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                    mult[jjj] = 0.0;
                    for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                        mult[jjj] += (data[iii].annotations[ppp] - mu[kkk][ppp]) * sigmaVals[kkk][ppp][jjj];
                    }
                }
                for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                    sum += mult[jjj] * (data[iii].annotations[jjj] - mu[kkk][jjj]);
                }

                // Mixing weight times the Gaussian density
                pVarInCluster[kkk][iii] = pCluster[kkk] * (Math.exp( -0.5 * sum ) / denom[kkk]);
                likelihood += pVarInCluster[kkk][iii];
                // Numerical-stability guards: dump diagnostics and abort rather than propagate NaNs
                if(Double.isNaN(denom[kkk]) || determinant[kkk] < 0.5 * MIN_DETERMINANT) {
                    System.out.println("det = " + sigma[kkk].det());
                    System.out.println("denom = " + denom[kkk]);
                    System.out.println("sumExp = " + sum);
                    System.out.println("pVar = " + pVarInCluster[kkk][iii]);
                    System.out.println("=-------=");
                    throw new StingException("Numerical Instability! determinant of covariance matrix <= 0. Try running with fewer clusters and then with better behaved annotation values.");
                }
                if(sum < 0.0) { // negative Mahalanobis distance means Sigma is not positive definite
                    System.out.println("det = " + sigma[kkk].det());
                    System.out.println("denom = " + denom[kkk]);
                    System.out.println("sumExp = " + sum);
                    System.out.println("pVar = " + pVarInCluster[kkk][iii]);
                    System.out.println("=-------=");
                    throw new StingException("Numerical Instability! covariance matrix no longer positive definite. Try running with fewer clusters and then with better behaved annotation values.");
                }
                if(pVarInCluster[kkk][iii] > 1.0) {
                    System.out.println("det = " + sigma[kkk].det());
                    System.out.println("denom = " + denom[kkk]);
                    System.out.println("sumExp = " + sum);
                    System.out.println("pVar = " + pVarInCluster[kkk][iii]);
                    System.out.println("=-------=");
                    throw new StingException("Numerical Instability! probability distribution returns > 1.0. Try running with fewer clusters and then with better behaved annotation values.");
                }

                if( pVarInCluster[kkk][iii] < MIN_PROB ) { // Very small numbers are a very big problem
                    pVarInCluster[kkk][iii] = MIN_PROB; // + MIN_PROB * rand.nextDouble();
                }

                sumProb += pVarInCluster[kkk][iii];
            }

            // Normalize responsibilities over clusters, then weight by the variant's training weight
            for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
                pVarInCluster[kkk][iii] /= sumProb;
                pVarInCluster[kkk][iii] *= data[iii].weight;
            }
        }

        logger.info("Explained likelihood = " + String.format("%.5f",likelihood / data.length));
    }
|
||||
|
||||
|
||||
    /**
     * Evaluates every trained Gaussian at a single (already normalized) annotation vector and
     * fills pVarInCluster[kkk] with the mixing-weighted density for each cluster. Unlike the
     * batch expectation step, the results are NOT normalized over clusters here; the caller
     * sums them to produce the variant's score. Relies on sigmaInverse[] and determinant[]
     * having been precomputed.
     *
     * @param annotations   the normalized annotation vector for one variant
     * @param pVarInCluster output array of per-cluster weighted densities, length numGaussians
     */
    private void evaluateGaussiansForSingleVariant( final double[] annotations, final double[] pVarInCluster ) {

        final int numAnnotations = annotations.length;
        final double mult[] = new double[numAnnotations];
        for( int kkk = 0; kkk < numGaussians; kkk++ ) {
            final double sigmaVals[][] = sigmaInverse[kkk].getArray();
            // Mahalanobis distance: (x - mu)^T * Sigma^-1 * (x - mu), computed in two passes
            double sum = 0.0;
            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                mult[jjj] = 0.0;
                for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                    mult[jjj] += (annotations[ppp] - mu[kkk][ppp]) * sigmaVals[ppp][jjj];
                }
            }
            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                sum += mult[jjj] * (annotations[jjj] - mu[kkk][jjj]);
            }

            // Gaussian normalization constant; NOTE(review): 3.14159 is a truncated pi (see evaluateGaussians)
            final double denom = Math.pow(2.0 * 3.14159, ((double)numAnnotations) / 2.0) * Math.pow(determinant[kkk], 0.5);
            pVarInCluster[kkk] = pCluster[kkk] * (Math.exp( -0.5 * sum )) / denom;
        }
    }
|
||||
|
||||
|
||||
    /**
     * Maximization step of EM: recomputes each cluster's mean, covariance, and mixing weight
     * from the responsibilities produced by the expectation step, then renormalizes the mixing
     * weights so they sum to one. Covariance entries are floored at MIN_SIGMA and mirrored to
     * keep sigma symmetric.
     *
     * @param data          the normalized variant data
     * @param pVarInCluster responsibilities from evaluateGaussians, indexed [cluster][variant]
     * @param startCluster  first cluster index, inclusive
     * @param stopCluster   last cluster index, exclusive
     */
    private void maximizeGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {

        final int numVariants = data.length;
        final int numAnnotations = data[0].annotations.length;
        final double sigmaVals[][][] = new double[numGaussians][numAnnotations][numAnnotations];

        // Reset the accumulators for the clusters being re-estimated
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                mu[kkk][jjj] = 0.0;
                for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                    sigmaVals[kkk][jjj][ppp] = 0.0;
                }
            }
        }
        double sumPK = 0.0;
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            // New mean: responsibility-weighted average of the data points
            double sumProb = 0.0;
            for( int iii = 0; iii < numVariants; iii++ ) {
                final double prob = pVarInCluster[kkk][iii];
                sumProb += prob;
                for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                    mu[kkk][jjj] += prob * data[iii].annotations[jjj];
                }
            }

            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                mu[kkk][jjj] /= sumProb;
            }

            // New covariance: responsibility-weighted outer product of deviations from the new mean
            // (only the upper triangle is accumulated; it is mirrored below)
            for( int iii = 0; iii < numVariants; iii++ ) {
                final double prob = pVarInCluster[kkk][iii];
                for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                    for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
                        sigmaVals[kkk][jjj][ppp] += prob * (data[iii].annotations[jjj]-mu[kkk][jjj]) * (data[iii].annotations[ppp]-mu[kkk][ppp]);
                    }
                }
            }

            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
                    if( sigmaVals[kkk][jjj][ppp] < MIN_SIGMA ) { // Very small numbers are a very big problem
                        sigmaVals[kkk][jjj][ppp] = MIN_SIGMA;// + MIN_SIGMA * rand.nextDouble();
                    }
                    sigmaVals[kkk][ppp][jjj] = sigmaVals[kkk][jjj][ppp]; // sigma must be a symmetric matrix
                }
            }

            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                    sigmaVals[kkk][jjj][ppp] /= sumProb;
                }
            }

            sigma[kkk] = new Matrix(sigmaVals[kkk]);
            determinant[kkk] = sigma[kkk].det();

            // New mixing weight: fraction of total responsibility carried by this cluster
            pCluster[kkk] = sumProb / numVariants;
            sumPK += pCluster[kkk];
        }

        // ensure pCluster sums to one, it doesn't automatically due to very small numbers getting capped
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            pCluster[kkk] /= sumPK;
        }

        /*
        // Clean up extra big or extra small clusters
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            if( pCluster[kkk] > 0.45 ) { // This is a very large cluster compared to all the others
                System.out.println("!! Found very large cluster! Busting it up into smaller clusters.");
                final int numToReplace = 3;
                final Matrix savedSigma = sigma[kkk];
                for( int rrr = 0; rrr < numToReplace; rrr++ ) {
                    // Find an example variant in the large cluster, drawn randomly
                    int randVarIndex = -1;
                    boolean foundVar = false;
                    while( !foundVar ) {
                        randVarIndex = rand.nextInt( numVariants );
                        final double probK = pVarInCluster[kkk][randVarIndex];
                        boolean inClusterK = true;
                        for( int ccc = startCluster; ccc < stopCluster; ccc++ ) {
                            if( pVarInCluster[ccc][randVarIndex] > probK ) {
                                inClusterK = false;
                                break;
                            }
                        }
                        if( inClusterK ) { foundVar = true; }
                    }

                    // Find a place to put the example variant
                    if( rrr == 0 ) { // Replace the big cluster that kicked this process off
                        mu[kkk] = data[randVarIndex].annotations;
                        pCluster[kkk] = 1.0 / ((double) (stopCluster-startCluster));
                    } else { // Replace the cluster with the minimum prob
                        double minProb = pCluster[startCluster];
                        int minClusterIndex = startCluster;
                        for( int ccc = startCluster; ccc < stopCluster; ccc++ ) {
                            if( pCluster[ccc] < minProb ) {
                                minProb = pCluster[ccc];
                                minClusterIndex = ccc;
                            }
                        }
                        mu[minClusterIndex] = data[randVarIndex].annotations;
                        sigma[minClusterIndex] = savedSigma;
                        //for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                        //    for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                        //        sigma[minClusterIndex].set(jjj, ppp, sigma[minClusterIndex].get(jjj, ppp) - 0.06 + 0.12 * rand.nextDouble());
                        //    }
                        //}
                        pCluster[minClusterIndex] = 0.5 / ((double) (stopCluster-startCluster));
                    }
                }
            }
        }
        */

        /*
        // Replace extremely small clusters with another random draw from the dataset
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            if( pCluster[kkk] < 0.0005 * (1.0 / ((double) (stopCluster-startCluster))) ||
                determinant[kkk] < MIN_DETERMINANT ) { // This is a very small cluster compared to all the others
                logger.info("!! Found singular cluster! Initializing a new random cluster.");
                pCluster[kkk] = 0.1 / ((double) (stopCluster-startCluster));
                mu[kkk] = data[rand.nextInt(numVariants)].annotations;
                final double[][] randSigma = new double[numAnnotations][numAnnotations];
                for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                    for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
                        randSigma[ppp][jjj] = 0.50 + 0.5 * rand.nextDouble(); // data is normalized so this is centered at 1.0
                        if(jjj != ppp) { randSigma[jjj][ppp] = 0.0; } // Sigma is a symmetric, positive-definite matrix
                    }
                }
                Matrix tmp = new Matrix(randSigma);
                tmp = tmp.times(tmp.transpose());
                sigma[kkk] = tmp;
                determinant[kkk] = sigma[kkk].det();
            }
        }

        // renormalize pCluster since things might have changed due to the previous step
        sumPK = 0.0;
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            sumPK += pCluster[kkk];
        }
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            pCluster[kkk] /= sumPK;
        }
        */
    }
|
||||
}
|
||||
|
|
@ -0,0 +1,78 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Mar 1, 2010
|
||||
*/
|
||||
|
||||
/**
 * A k-nearest-neighbors variant optimization model. Currently disabled: run() always throws,
 * because the previous implementation (preserved in the commented block below) hasn't been
 * updated to work with the new data manager.
 */
public final class VariantNearestNeighborsModel extends VariantOptimizationModel {

    // Number of nearest neighbors consulted when estimating the local Ti/Tv ratio
    private final int numKNN;

    /**
     * @param _dataManager currently unused -- NOTE(review): the field assignment is commented
     *                     out below; re-enable it when the model is updated
     * @param _targetTITV  target novel Ti/Tv ratio, forwarded to the base model
     * @param _numKNN      number of nearest neighbors to use
     */
    public VariantNearestNeighborsModel( VariantDataManager _dataManager, final double _targetTITV, final int _numKNN ) {
        super( _targetTITV );
        //dataManager = _dataManager;
        numKNN = _numKNN;
    }

    /**
     * Not yet implemented -- always throws.
     *
     * @param outputPrefix prefix for the ".knn.optimize" output file (unused until re-enabled)
     * @throws StingException always
     */
    public void run( final String outputPrefix ) {

        throw new StingException( "Nearest Neighbors model hasn't been updated yet." );
        /*
        final int numVariants = dataManager.numVariants;

        final double[] pTrueVariant = new double[numVariants];

        final VariantTree vTree = new VariantTree( numKNN );
        vTree.createTreeFromData( dataManager.data );

        System.out.println("Finished creating the kd-tree.");

        for(int iii = 0; iii < numVariants; iii++) {
            pTrueVariant[iii] = calcTruePositiveRateFromTITV( vTree.calcNeighborhoodTITV( dataManager.data[iii] ) );
        }

        PrintStream outputFile;
        try {
            outputFile = new PrintStream( outputPrefix + ".knn.optimize" );
        } catch (Exception e) {
            throw new StingException( "Unable to create output file: " + outputPrefix + ".knn.optimize" );
        }
        for(int iii = 0; iii < numVariants; iii++) {
            outputFile.print(String.format("%.4f",pTrueVariant[iii]) + ",");
            outputFile.println( (dataManager.data[iii].isTransition ? 1 : 0)
                    + "," + (dataManager.data[iii].isKnown? 1 : 0));
        }
        */
    }
}
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 26, 2010
|
||||
*/
|
||||
|
||||
/**
 * Common entry point shared by the variant optimization models (Gaussian mixture,
 * k-nearest-neighbors): each model implements run() to train itself and write its results.
 */
public interface VariantOptimizationInterface {
    /**
     * Runs the model end to end.
     *
     * @param outputPrefix prefix (or base name) for the model's output files
     */
    public void run( String outputPrefix );
}
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 26, 2010
|
||||
*/
|
||||
|
||||
public abstract class VariantOptimizationModel implements VariantOptimizationInterface {
|
||||
|
||||
public enum Model {
|
||||
GAUSSIAN_MIXTURE_MODEL,
|
||||
K_NEAREST_NEIGHBORS
|
||||
}
|
||||
|
||||
protected final double targetTITV;
|
||||
|
||||
public VariantOptimizationModel() {
|
||||
targetTITV = 0.0;
|
||||
}
|
||||
|
||||
public VariantOptimizationModel( final double _targetTITV ) {
|
||||
targetTITV = _targetTITV;
|
||||
}
|
||||
|
||||
public final double calcTruePositiveRateFromTITV( final double _titv ) {
|
||||
double titv = _titv;
|
||||
if( titv > targetTITV ) { titv -= 2.0f*(titv-targetTITV); }
|
||||
if( titv < 0.5 ) { titv = 0.5; }
|
||||
return ( (titv - 0.5) / (targetTITV - 0.5) );
|
||||
//if( titv < 0.0 ) { titv = 0.0; }
|
||||
//return ( titv / targetTITV );
|
||||
}
|
||||
|
||||
public final double calcTruePositiveRateFromKnownTITV( final double knownTITV, final double _novelTITV, final double overallTITV, final double knownAlphaFactor ) {
|
||||
|
||||
final double tprTarget = calcTruePositiveRateFromTITV( overallTITV );
|
||||
double novelTITV = _novelTITV;
|
||||
if( novelTITV > knownTITV ) { novelTITV -= 2.0f*(novelTITV-knownTITV); }
|
||||
if( novelTITV < 0.5 ) { novelTITV = 0.5; }
|
||||
final double tprKnown = ( (novelTITV - 0.5) / (knownTITV - 0.5) );
|
||||
|
||||
return ( knownAlphaFactor * tprKnown ) + ( (1.0 - knownAlphaFactor) * tprTarget );
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,244 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFReader;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
 *
 * <p>Reads a cluster file produced upstream, builds a Gaussian mixture model from it,
 * rescores every input SNP with a new phred-scaled quality, and writes the
 * recalibrated records (with the original quality preserved in the "OQ" INFO field)
 * to {@code OUTPUT_PREFIX + ".vcf"}. Finally launches an Rscript to plot the
 * optimization curve.</p>
 *
 * @author rpoplin
 * @since Mar 17, 2010
 *
 * @help.summary Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
 */

public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {

    /////////////////////////////
    // Command Line Arguments
    /////////////////////////////
    @Argument(fullName="target_titv", shortName="titv", doc="The expected Ti/Tv ratio to display on optimization curve output figures. (~~2.1 for whole genome experiments)", required=false)
    private double TARGET_TITV = 2.1;
    @Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by spreading out the Gaussians.", required=false)
    private double BACKOFF_FACTOR = 1.0;
    @Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
    private int DESIRED_NUM_VARIANTS = 0;
    @Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
    private boolean IGNORE_ALL_INPUT_FILTERS = false;
    @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
    private String[] IGNORE_INPUT_FILTERS = null;
    @Argument(fullName="known_prior", shortName="knownPrior", doc="A prior on the quality of known variants, a phred scaled probability of being true.", required=false)
    private int KNOWN_QUAL_PRIOR = 9;
    @Argument(fullName="novel_prior", shortName="novelPrior", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
    private int NOVEL_QUAL_PRIOR = 2;
    @Argument(fullName="quality_scale_factor", shortName="qScale", doc="Multiply all final quality scores by this value. Needed to normalize the quality scores.", required=false)
    private double QUALITY_SCALE_FACTOR = 50.0;
    @Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
    private String OUTPUT_PREFIX = "optimizer";
    @Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
    private String CLUSTER_FILENAME = "optimizer.cluster";
    //@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
    private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
    @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
    private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
    @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
    private String PATH_TO_RESOURCES = "R/";

    /////////////////////////////
    // Private Member Variables
    /////////////////////////////
    private VariantGaussianMixtureModel theModel = null; // mixture model built from CLUSTER_FILENAME in initialize()
    private VCFWriter vcfWriter; // writes recalibrated records to OUTPUT_PREFIX + ".vcf"
    private Set<String> ignoreInputFilterSet = null; // filter names from --ignore_filter, or null if none given
    private final ArrayList<String> ALLOWED_FORMAT_FIELDS = new ArrayList<String>(); // genotype FORMAT keys passed through to the output VCF

    //---------------------------------------------------------------------------------------------------------------
    //
    // initialize
    //
    //---------------------------------------------------------------------------------------------------------------

    /**
     * Parses the filter-ignore options, builds the Gaussian mixture model from the
     * cluster file, opens the output VCF writer with a merged header, and verifies
     * that a dbSNP track was supplied (required to distinguish known from novel sites).
     */
    public void initialize() {
        if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }

        if( IGNORE_INPUT_FILTERS != null ) {
            ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
        }

        // Only the Gaussian mixture model is currently wired up; the kNN model
        // remains commented out pending the --optimization_model argument above.
        switch (OPTIMIZATION_MODEL) {
            case GAUSSIAN_MIXTURE_MODEL:
                theModel = new VariantGaussianMixtureModel( TARGET_TITV, CLUSTER_FILENAME, BACKOFF_FACTOR );
                break;
            //case K_NEAREST_NEIGHBORS:
            //    theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
            //    break;
            default:
                throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
        }

        ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_KEY); // copied from VariantsToVCF
        ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
        ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.DEPTH_KEY);
        ALLOWED_FORMAT_FIELDS.add(VCFGenotypeRecord.GENOTYPE_POSTERIORS_TRIPLET_KEY);

        // setup the header fields
        final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
        hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
        hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "The original variant quality score"));
        hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
        vcfWriter = new VCFWriter( new File(OUTPUT_PREFIX + ".vcf") );

        // Collect the union of sample names across every VCF input track so the
        // output header lists them all.
        final TreeSet<String> samples = new TreeSet<String>();
        final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
        for( final ReferenceOrderedDataSource source : dataSources ) {
            final RMDTrack rod = source.getReferenceOrderedData();
            if( rod.getRecordType().equals(VCFRecord.class) ) {
                // NOTE(review): if getHeader() throws, the reader is never closed;
                // a try/finally around close() would be safer.
                final VCFReader reader = new VCFReader(rod.getFile());
                final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
                samples.addAll(vcfSamples);
                reader.close();
            }
        }
        final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
        vcfWriter.writeHeader(vcfHeader);

        // A dbSNP track is mandatory: the quality priors below depend on
        // classifying each site as known vs. novel.
        boolean foundDBSNP = false;
        for( final ReferenceOrderedDataSource source : dataSources ) {
            final RMDTrack rod = source.getReferenceOrderedData();
            if( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
                foundDBSNP = true;
            }
        }

        if(!foundDBSNP) {
            throw new StingException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites.");
        }
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // map
    //
    //---------------------------------------------------------------------------------------------------------------

    /**
     * For each unfiltered (or filter-ignored) input SNP at this locus: computes a
     * recalibrated quality from the mixture model combined with allele-count and
     * known/novel priors, stashes the original quality in the "OQ" INFO field, and
     * writes the rescored record. Filtered SNPs are written through unchanged.
     *
     * @return the VariantDatum objects created at this locus (possibly empty)
     */
    public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
        final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();

        if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
            return mapList;
        }

        for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
            // Skip the dbSNP track itself and anything that isn't a SNP.
            if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
                final VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), ALLOWED_FORMAT_FIELDS, false, false);
                if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
                    final VariantDatum variantDatum = new VariantDatum();
                    variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;

                    // Known vs. novel status comes from the presence of a real SNP
                    // at this position in the dbSNP track.
                    final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
                    variantDatum.isKnown = dbsnp != null;
                    variantDatum.alleleCount = vc.getChromosomeCount(vc.getAlternateAllele(0)); // BUGBUG: assumes file has genotypes

                    // P(true) = model likelihood x allele-count prior x known/novel prior.
                    final double acPrior = theModel.getAlleleCountPrior( variantDatum.alleleCount );
                    final double knownPrior = ( variantDatum.isKnown ? QualityUtils.qualToProb(KNOWN_QUAL_PRIOR) : QualityUtils.qualToProb(NOVEL_QUAL_PRIOR) );
                    final double pTrue = theModel.evaluateVariant( vc ) * acPrior * knownPrior;

                    // Floor the error rate at 1e-9 to keep the phred scale finite.
                    variantDatum.qual = QUALITY_SCALE_FACTOR * QualityUtils.phredScaleErrorRate( Math.max(1.0 - pTrue, 0.000000001) ); // BUGBUG: don't have a normalizing constant, so need to scale up qual scores arbitrarily
                    mapList.add( variantDatum );

                    vcf.addInfoField("OQ", ((Double)vc.getPhredScaledQual()).toString() ); // preserve the original quality
                    vcf.setQual( variantDatum.qual );
                    vcf.setFilterString(VCFRecord.PASSES_FILTERS);
                    vcfWriter.addRecord( vcf );

                } else { // a filtered SNP (and its filter isn't ignored), so just dump it out to the VCF file unchanged
                    vcfWriter.addRecord( vcf );
                }
            }

        }

        return mapList;
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // reduce
    //
    //---------------------------------------------------------------------------------------------------------------

    /** @return an empty accumulator for the traversal. */
    public ExpandingArrayList<VariantDatum> reduceInit() {
        return new ExpandingArrayList<VariantDatum>();
    }

    /** Accumulates every per-locus datum into one list. */
    public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
        reduceSum.addAll( mapValue );
        return reduceSum;
    }

    /**
     * Closes the output VCF, writes the optimization-curve data file, and launches
     * Rscript to render the plot.
     */
    public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {

        vcfWriter.close();

        final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
        reduceSum.clear(); // Don't need this ever again, clean up some memory

        theModel.outputOptimizationCurve( dataManager.data, OUTPUT_PREFIX, DESIRED_NUM_VARIANTS );

        // Execute Rscript command to plot the optimization curve
        // Print out the command line to make it clear to the user what is being executed and how one might modify it
        final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
        System.out.println( rScriptCommandLine );

        // Execute the RScript command to plot the table of truth values
        // NOTE(review): exec() returns immediately -- this neither waits for the
        // plotting process nor drains its stdout/stderr, so plot failures go
        // unnoticed and the child can block on a full pipe buffer; consider
        // Process.waitFor() plus consuming the streams.
        try {
            Runtime.getRuntime().exec( rScriptCommandLine );
        } catch ( IOException e ) {
            // NOTE(review): the IOException is dropped here; chaining it as the
            // cause would preserve the stack trace, if StingException supports one.
            throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
        }
    }
}
|
||||
|
||||
|
|
@ -0,0 +1,144 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 24, 2010
|
||||
*/
|
||||
|
||||
public class VariantTree {
|
||||
private final VariantTreeNode root;
|
||||
private final int numKNN;
|
||||
|
||||
public VariantTree( final int _numKNN ) {
|
||||
root = new VariantTreeNode();
|
||||
numKNN = _numKNN;
|
||||
}
|
||||
|
||||
public void createTreeFromData( VariantDatum[] data ) {
|
||||
root.cutData( data, 0, 0, data[0].annotations.length );
|
||||
}
|
||||
|
||||
public double calcNeighborhoodTITV( final VariantDatum variant ) {
|
||||
|
||||
double[] distances;
|
||||
|
||||
// Grab the subset of points that are approximately near this point
|
||||
final VariantDatum[] data = getBin( variant.annotations, root );
|
||||
if( data.length < numKNN ) {
|
||||
throw new StingException( "Bin is too small. Should be > " + numKNN );
|
||||
}
|
||||
|
||||
// Find the X nearest points in the subset
|
||||
final double[] originalDistances = calcDistances( variant.annotations, data );
|
||||
distances = originalDistances.clone();
|
||||
quickSort( distances, 0, distances.length-1 ); // BUGBUG: distances.length or distances.length-1
|
||||
|
||||
final double minDistance = distances[numKNN - 1];
|
||||
|
||||
// Calculate probability of being true based on this set of SNPs
|
||||
int numTi = 0;
|
||||
int numTv = 0;
|
||||
for( int iii = 0; iii < distances.length; iii++ ) {
|
||||
if( originalDistances[iii] <= minDistance ) {
|
||||
if( data[iii].isTransition ) { numTi++; }
|
||||
else { numTv++; }
|
||||
}
|
||||
}
|
||||
|
||||
return ((double) numTi) / ((double) numTv);
|
||||
}
|
||||
|
||||
private VariantDatum[] getBin( final double[] variant, final VariantTreeNode node ) {
|
||||
if( node.variants != null ) {
|
||||
return node.variants;
|
||||
} else {
|
||||
if( variant[node.cutDim] < node.cutValue ) {
|
||||
return getBin( variant, node.left );
|
||||
} else {
|
||||
return getBin( variant, node.right );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private double[] calcDistances( final double[] variant, final VariantDatum[] data ) {
|
||||
final double[] distSquared = new double[data.length];
|
||||
int iii = 0;
|
||||
for( final VariantDatum variantDatum : data ) {
|
||||
distSquared[iii] = 0.0;
|
||||
int jjj = 0;
|
||||
for( final double value : variantDatum.annotations) {
|
||||
final double diff = variant[jjj] - value;
|
||||
distSquared[iii] += ( diff * diff );
|
||||
jjj++;
|
||||
}
|
||||
iii++;
|
||||
}
|
||||
|
||||
return distSquared;
|
||||
}
|
||||
|
||||
public static int partition(final double arr[], final int left, final int right)
|
||||
{
|
||||
int i = left, j = right;
|
||||
double tmp;
|
||||
final double pivot = arr[(left + right) / 2];
|
||||
|
||||
while (i <= j) {
|
||||
while (arr[i] < pivot) {
|
||||
i++;
|
||||
}
|
||||
while (arr[j] > pivot) {
|
||||
j--;
|
||||
}
|
||||
if (i <= j) {
|
||||
tmp = arr[i];
|
||||
arr[i] = arr[j];
|
||||
arr[j] = tmp;
|
||||
i++;
|
||||
j--;
|
||||
}
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
public static void quickSort(final double arr[], final int left, final int right) {
|
||||
final int index = partition(arr, left, right);
|
||||
if (left < index - 1) {
|
||||
quickSort(arr, left, index - 1);
|
||||
}
|
||||
if (index < right) {
|
||||
quickSort(arr, index, right);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,104 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
 * A node in the VariantTree used for approximate nearest-neighbor lookup.
 * Internal nodes carry a cut dimension and cut value; leaf nodes carry the
 * variants that fell into their bin (a non-null {@code variants} array marks
 * a leaf).
 *
 * Created by IntelliJ IDEA.
 * User: rpoplin
 * Date: Feb 24, 2010
 */

public class VariantTreeNode {

    public VariantTreeNode left;    // child for values below cutValue; null at a leaf (and on the failed-cut path below)
    public VariantTreeNode right;   // child for values at/above cutValue; null at a leaf
    public int cutDim;              // annotation dimension this node cuts on; -1 until cutData runs
    public double cutValue;         // threshold separating the two children
    public VariantDatum[] variants; // the data bin; non-null only at leaf nodes

    // Smallest child bin a cut is allowed to produce; cuts that would go below
    // this are rejected and the next dimension is tried instead.
    private final int minBinSize = 8000; // BUGBUG: must be larger than number of kNN

    public VariantTreeNode() {
        left = null;
        right = null;
        variants = null;
        cutDim = -1;
        cutValue = -1;
    }

    /**
     * Recursively partitions data into a binary tree, cycling through the
     * annotation dimensions by depth. The two children deliberately OVERLAP:
     * the left child keeps values below the 60th-percentile pivot while the
     * right child keeps values at/above the 40th-percentile pivot, so points
     * near the cut land in both bins, padding the neighborhoods used for kNN.
     *
     * @param data           the variants to partition at this node
     * @param depth          current depth; depth % numAnnotations selects the cut dimension
     * @param lastCutDepth   depth of the most recent successful cut (used to detect
     *                       a full unsuccessful cycle through all dimensions)
     * @param numAnnotations number of annotation dimensions per variant
     */
    public final void cutData( final VariantDatum[] data, final int depth, final int lastCutDepth, final int numAnnotations ) {

        cutDim = depth % numAnnotations;

        if( depth != lastCutDepth && (cutDim == (lastCutDepth % numAnnotations)) ) { // Base case: we've tried to cut on all the annotations
            variants = data;
            return;
        }

        // Values of the chosen annotation, plus a sorted copy for the percentiles.
        final double[] values = new double[data.length];
        for( int iii = 0; iii < data.length; iii++ ) {
            values[iii] = data[iii].annotations[cutDim];
        }

        final double[] sortedValues = values.clone();
        VariantTree.quickSort( sortedValues, 0, values.length-1 ); // BUGBUG: values.length or values.length-1 (length-1 is right: quickSort's bounds are inclusive)

        // Overlapping pivots at roughly the 40th and 60th percentiles.
        // NOTE(review): for very small inputs (e.g. data.length == 1),
        // Math.round(0.60f * length) can equal length and index past the end
        // of sortedValues -- confirm callers never build a tree that small.
        final int lowPivotIndex = Math.round(0.40f * sortedValues.length);
        final int highPivotIndex = Math.round(0.60f * sortedValues.length);
        final double lowPivot = sortedValues[lowPivotIndex];
        final double highPivot = sortedValues[highPivotIndex];
        cutValue = highPivot;

        // Count how many points would land in each (overlapping) child bin.
        int numLow = 0;
        int numHigh = 0;
        for( int iii = 0; iii < data.length; iii++ ) {
            if( values[iii] < highPivot ) { numLow++; }
            if( values[iii] >= lowPivot ) { numHigh++; }
        }

        // If cutting here makes the bin too small then don't cut
        if( numLow < minBinSize || numHigh < minBinSize || (numLow == numHigh && numLow == data.length) ) {
            // Failed cut: keep all the data, route it to a single right child, and
            // retry on the next dimension. cutValue becomes the minimum so in-range
            // lookups (value >= min) go right.
            // NOTE(review): a query value BELOW this minimum would descend into the
            // null left child in VariantTree.getBin -- verify that can't happen.
            cutValue = sortedValues[0];
            right = new VariantTreeNode();
            right.cutData(data, depth+1, lastCutDepth, numAnnotations);
        } else {
            // Successful cut: materialize the two overlapping bins and recurse,
            // recording this depth as the last successful cut.
            final VariantDatum[] leftData = new VariantDatum[numLow];
            final VariantDatum[] rightData = new VariantDatum[numHigh];
            int leftIndex = 0;
            int rightIndex = 0;
            for( int iii = 0; iii < data.length; iii++ ) {
                if( values[iii] < highPivot ) { leftData[leftIndex++] = data[iii]; }
                if( values[iii] >= lowPivot ) { rightData[rightIndex++] = data[iii]; }
            }

            left = new VariantTreeNode();
            right = new VariantTreeNode();
            left.cutData(leftData, depth+1, depth, numAnnotations);
            right.cutData(rightData, depth+1, depth, numAnnotations);
        }
    }

}
|
||||
|
|
@ -0,0 +1,65 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.File;
|
||||
|
||||
/**
 * Performance/integration tests for the variant recalibration walkers: runs
 * GenerateVariantClusters over a validation VCF and then feeds its output
 * cluster file into VariantRecalibrator, via the WalkerTest harness.
 */
public class VariantRecalibrationWalkersPerformanceTest extends WalkerTest {
    // Maps input VCF path -> absolute path of the cluster params file produced
    // by testGenerateVariantClusters.
    // NOTE(review): mutable static state shared between @Test methods --
    // testVariantRecalibrator silently skips its body when the entry is absent,
    // and JUnit does not guarantee method execution order; confirm the harness
    // runs these in declaration order.
    static HashMap<String, String> paramsFiles = new HashMap<String, String>();

    /** Runs GenerateVariantClusters and records the resulting cluster file path. */
    @Test
    public void testGenerateVariantClusters() {
        // Input VCF -> expected md5; the md5 side is currently unused (see note below).
        HashMap<String, String> e = new HashMap<String, String>();
        e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "b39bde091254e90490b0d7bf9f0ef24a" );

        for ( Map.Entry<String, String> entry : e.entrySet() ) {
            String vcf = entry.getKey();
            String md5 = entry.getValue(); // NOTE(review): unused -- the spec below declares zero md5-checked outputs

            WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                    "-R " + oneKGLocation + "reference/human_b36_both.fasta" +
                            " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                            " -T GenerateVariantClusters" +
                            " -B input,VCF," + vcf +
                            " -nG 6" +
                            " -nI 5" +
                            " --ignore_filter GATK_STANDARD" +
                            " -an QD -an HRun -an SB" +
                            " -clusterFile /dev/null",
                    0, // just one output file
                    new ArrayList<String>(0));
            List<File> result = executeTest("testGenerateVariantClusters", spec).getFirst();
            // Remember where the cluster params file landed for the recalibrator test.
            paramsFiles.put(vcf, result.get(0).getAbsolutePath());
        }
    }

    /** Runs VariantRecalibrator using the cluster file produced by the test above. */
    @Test
    public void testVariantRecalibrator() {
        // Input VCF -> expected md5; the md5 side is currently unused (see note below).
        HashMap<String, String> e = new HashMap<String, String>();
        e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "412bdb2eb4ca8f7ee9dfb39cda676c95" );

        for ( Map.Entry<String, String> entry : e.entrySet() ) {
            String vcf = entry.getKey();
            String md5 = entry.getValue(); // NOTE(review): unused -- the spec below declares zero md5-checked outputs
            String paramsFile = paramsFiles.get(vcf);
            System.out.printf("PARAMS FOR %s is %s%n", vcf, paramsFile);
            // Skips silently when testGenerateVariantClusters hasn't populated the map.
            if ( paramsFile != null ) {
                File file = createTempFile("cluster",".vcf");
                WalkerTestSpec spec = new WalkerTestSpec(
                        "-R " + oneKGLocation + "reference/human_b36_both.fasta" +
                                " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" +
                                " -T VariantRecalibrator" +
                                " -B input,VCF," + vcf +
                                " --ignore_filter GATK_STANDARD" +
                                " -output /dev/null" +
                                " -clusterFile " + validationDataLocation + "clusterFile",
                        0,
                        new ArrayList<String>(0));

                executeTest("testVariantRecalibrator", spec);
            }
        }
    }
}
|
||||
Loading…
Reference in New Issue