Variant Recalibrator MLE EM algorithm is moved over to variational Bayes EM in order to eliminate problems with singularities when clustering in higher than two dimensions. Because of this there is no longer a number of Gaussians parameter. Wiki will be updated shortly with new recommended command.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3704 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-07-01 18:51:07 +00:00
parent 4903d1fb4f
commit 255b036fb5
5 changed files with 252 additions and 142 deletions

View File

@ -65,22 +65,20 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
private String[] USE_ANNOTATIONS = null; private String[] USE_ANNOTATIONS = null;
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true) @Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
private String CLUSTER_FILENAME = "optimizer.cluster"; private String CLUSTER_FILENAME = "optimizer.cluster";
@Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used when clustering", required=false) @Argument(fullName="maxGaussians", shortName="mG", doc="The maximum number of Gaussians to try during Bayesian clustering", required=false)
private int NUM_GAUSSIANS = 4; private int MAX_GAUSSIANS = 30;
@Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of iterations to be performed when clustering. Clustering will normally end when convergence is detected.", required=false) @Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of iterations to be performed when clustering. Clustering will normally end when convergence is detected.", required=false)
private int MAX_ITERATIONS = 200; private int MAX_ITERATIONS = 200;
//@Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. It can be used to prevent overfitting.", required=false)
private int MIN_VAR_IN_CLUSTER = 0;
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false) @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "Rscript"; private String PATH_TO_RSCRIPT = "Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false) @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
private String PATH_TO_RESOURCES = "R/"; private String PATH_TO_RESOURCES = "R/";
@Argument(fullName="weightNovels", shortName="weightNovels", doc="The weight for novel variants during clustering", required=false) @Argument(fullName="weightNovels", shortName="weightNovels", doc="The weight for novel variants during clustering", required=false)
private double WEIGHT_NOVELS = 0.0; private double WEIGHT_NOVELS = 0.0;
@Argument(fullName="weightKnowns", shortName="weightKnowns", doc="The weight for known variants during clustering", required=false) @Argument(fullName="weightKnowns", shortName="weightKnowns", doc="The weight for MQ2+ known variants during clustering", required=false)
private double WEIGHT_KNOWNS = 0.0; private double WEIGHT_KNOWNS = 0.0;
@Argument(fullName="weightHapMap", shortName="weightHapMap", doc="The weight for known HapMap variants during clustering", required=false) @Argument(fullName="weightHapMap", shortName="weightHapMap", doc="The weight for known HapMap variants during clustering", required=false)
private double WEIGHT_HAPMAP = 100.0; private double WEIGHT_HAPMAP = 1.0;
@Argument(fullName="weight1000Genomes", shortName="weight1000Genomes", doc="The weight for known 1000 Genomes Project variants during clustering", required=false) @Argument(fullName="weight1000Genomes", shortName="weight1000Genomes", doc="The weight for known 1000 Genomes Project variants during clustering", required=false)
private double WEIGHT_1000GENOMES = 1.0; private double WEIGHT_1000GENOMES = 1.0;
@Argument(fullName="weightMQ1", shortName="weightMQ1", doc="The weight for MQ1 dbSNP variants during clustering", required=false) @Argument(fullName="weightMQ1", shortName="weightMQ1", doc="The weight for MQ1 dbSNP variants during clustering", required=false)
@ -91,6 +89,10 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
private double STD_THRESHOLD = 6.0; private double STD_THRESHOLD = 6.0;
@Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for clustering.", required=false) @Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for clustering.", required=false)
private double QUAL_THRESHOLD = 300.0; private double QUAL_THRESHOLD = 300.0;
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
private double SHRINKAGE = 0.0001;
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algoirthm.", required=false)
private double DIRICHLET_PARAMETER = 1000.0;
//@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false) //@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
@ -104,8 +106,6 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
private ExpandingArrayList<String> annotationKeys; private ExpandingArrayList<String> annotationKeys;
private Set<String> ignoreInputFilterSet = null; private Set<String> ignoreInputFilterSet = null;
private int maxAC = 0; private int maxAC = 0;
private PrintStream outFile;
private final static boolean FXYZ_FILE = false; // Debug argument
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
// //
@ -134,14 +134,6 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
if(!foundDBSNP) { if(!foundDBSNP) {
throw new StingException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites."); throw new StingException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites.");
} }
if(FXYZ_FILE) {
try {
outFile = new PrintStream("variants.filtered.xyz");
} catch(Exception e) {
throw new StingException("Can't create file!");
}
}
} }
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
@ -166,7 +158,7 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) { if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
int iii = 0; int iii = 0;
for( final String key : annotationKeys ) { for( final String key : annotationKeys ) {
annotationValues[iii++] = VariantGaussianMixtureModel.decodeAnnotation( key, vc ); annotationValues[iii++] = VariantGaussianMixtureModel.decodeAnnotation( key, vc, true );
} }
final VariantDatum variantDatum = new VariantDatum(); final VariantDatum variantDatum = new VariantDatum();
@ -179,18 +171,16 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
variantDatum.isKnown = false; variantDatum.isKnown = false;
variantDatum.weight = WEIGHT_NOVELS; variantDatum.weight = WEIGHT_NOVELS;
variantDatum.knownStatus = VariantDatum.NOVEL;
variantDatum.qual = vc.getPhredScaledQual(); variantDatum.qual = vc.getPhredScaledQual();
final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)); final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
if( dbsnp != null ) { if( dbsnp != null ) {
variantDatum.isKnown = true; variantDatum.isKnown = true;
variantDatum.weight = WEIGHT_KNOWNS; variantDatum.weight = WEIGHT_KNOWNS;
variantDatum.knownStatus = VariantDatum.KNOWN_MQ0;
if( DbSNPHelper.isHapmap( dbsnp ) ) { variantDatum.weight = WEIGHT_HAPMAP; variantDatum.knownStatus = VariantDatum.KNOWN_HAPMAP; } if( DbSNPHelper.isHapmap( dbsnp ) ) { variantDatum.weight = WEIGHT_HAPMAP; }
else if( DbSNPHelper.is1000genomes( dbsnp ) ) { variantDatum.weight = WEIGHT_1000GENOMES; variantDatum.knownStatus = VariantDatum.KNOWN_1KG; } else if( DbSNPHelper.is1000genomes( dbsnp ) ) { variantDatum.weight = WEIGHT_1000GENOMES; }
else if( DbSNPHelper.isMQ1( dbsnp ) ) { variantDatum.weight = WEIGHT_MQ1; variantDatum.knownStatus = VariantDatum.KNOWN_MQ1; } else if( DbSNPHelper.isMQ1( dbsnp ) ) { variantDatum.weight = WEIGHT_MQ1; }
} }
mapList.add( variantDatum ); mapList.add( variantDatum );
@ -226,18 +216,12 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ] dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]
if(FXYZ_FILE) { // Debug output
for(final VariantDatum datum : dataManager.data) {
outFile.println(String.format("%f,%f,%f,%d,%f",datum.annotations[0], datum.annotations[1], datum.weight, datum.knownStatus, datum.qual));
}
outFile.close();
}
// Create either the Gaussian Mixture Model or the Nearest Neighbors model and run it // Create either the Gaussian Mixture Model or the Nearest Neighbors model and run it
VariantGaussianMixtureModel theModel; VariantGaussianMixtureModel theModel;
switch (OPTIMIZATION_MODEL) { switch (OPTIMIZATION_MODEL) {
case GAUSSIAN_MIXTURE_MODEL: case GAUSSIAN_MIXTURE_MODEL:
theModel = new VariantGaussianMixtureModel( dataManager, NUM_GAUSSIANS, MAX_ITERATIONS, MIN_VAR_IN_CLUSTER, maxAC, FORCE_INDEPENDENT, STD_THRESHOLD, QUAL_THRESHOLD ); theModel = new VariantGaussianMixtureModel( dataManager, MAX_GAUSSIANS, MAX_ITERATIONS, maxAC, FORCE_INDEPENDENT,
STD_THRESHOLD, QUAL_THRESHOLD, SHRINKAGE, DIRICHLET_PARAMETER );
break; break;
//case K_NEAREST_NEIGHBORS: //case K_NEAREST_NEIGHBORS:
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN ); // theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );

View File

@ -38,11 +38,4 @@ public class VariantDatum {
public double qual; public double qual;
public double weight; public double weight;
public int alleleCount; public int alleleCount;
public int knownStatus;
public final static int NOVEL = -1;
public final static int KNOWN_MQ0 = 0;
public final static int KNOWN_MQ1 = 1;
public final static int KNOWN_1KG = 2;
public final static int KNOWN_HAPMAP = 3;
} }

View File

@ -51,13 +51,15 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
protected final static Logger logger = Logger.getLogger(VariantGaussianMixtureModel.class); protected final static Logger logger = Logger.getLogger(VariantGaussianMixtureModel.class);
public final VariantDataManager dataManager; public final VariantDataManager dataManager;
private final int numGaussians; private final int maxGaussians;
private final int maxIterations; private final int maxIterations;
private final static long RANDOM_SEED = 91801305; private final static long RANDOM_SEED = 91801305;
private final static Random rand = new Random( RANDOM_SEED ); private final static Random rand = new Random( RANDOM_SEED );
private final double MIN_SIGMA = 1E-10;
private final double MIN_DETERMINANT = 1E-3;
private final double MIN_PROB_CONVERGENCE = 1E-5; private final double MIN_PROB_CONVERGENCE = 1E-5;
private final double SHRINKAGE;
private final double DIRICHLET_PARAMETER;
private final boolean FORCE_INDEPENDENT_ANNOTATIONS; private final boolean FORCE_INDEPENDENT_ANNOTATIONS;
private final double[][] mu; // The means for each cluster private final double[][] mu; // The means for each cluster
@ -66,30 +68,42 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
private double[] pClusterLog10; private double[] pClusterLog10;
private final double[] determinant; private final double[] determinant;
private final double[] alleleCountFactorArray; private final double[] alleleCountFactorArray;
private final int minVarInCluster;
private final double stdThreshold; private final double stdThreshold;
private final double qualThreshold; private final double qualThreshold;
private double[] empiricalMu;
private Matrix empiricalSigma;
private final double[] hyperParameter_a;
private final double[] hyperParameter_b;
private final double[] hyperParameter_lambda;
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*"); private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
private static final Pattern ALLELECOUNT_PATTERN = Pattern.compile("^@!ALLELECOUNT.*"); private static final Pattern ALLELECOUNT_PATTERN = Pattern.compile("^@!ALLELECOUNT.*");
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*"); private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final int _numGaussians, final int _maxIterations, final int _minVarInCluster, public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final int _maxGaussians, final int _maxIterations,
final int maxAC, final boolean _forceIndependent, final double _stdThreshold, final double _qualThreshold ) { final int maxAC, final boolean _forceIndependent, final double _stdThreshold, final double _qualThreshold,
final double _shrinkage, final double _dirichlet) {
dataManager = _dataManager; dataManager = _dataManager;
numGaussians = _numGaussians; maxGaussians = _maxGaussians;
maxIterations = _maxIterations; maxIterations = _maxIterations;
mu = new double[numGaussians][]; mu = new double[maxGaussians][];
sigma = new Matrix[numGaussians]; sigma = new Matrix[maxGaussians];
determinant = new double[numGaussians]; determinant = new double[maxGaussians];
pClusterLog10 = new double[numGaussians]; pClusterLog10 = new double[maxGaussians];
alleleCountFactorArray = new double[maxAC + 1]; alleleCountFactorArray = new double[maxAC + 1];
minVarInCluster = _minVarInCluster;
stdThreshold = _stdThreshold; stdThreshold = _stdThreshold;
qualThreshold = _qualThreshold; qualThreshold = _qualThreshold;
FORCE_INDEPENDENT_ANNOTATIONS = _forceIndependent; FORCE_INDEPENDENT_ANNOTATIONS = _forceIndependent;
sigmaInverse = null; // This field isn't used during VariantOptimizer pass hyperParameter_a = new double[maxGaussians];
hyperParameter_b = new double[maxGaussians];
hyperParameter_lambda = new double[maxGaussians];
sigmaInverse = null; // This field isn't used during GenerateVariantClusters pass
SHRINKAGE = _shrinkage;
DIRICHLET_PARAMETER = _dirichlet;
} }
public VariantGaussianMixtureModel( final double _targetTITV, final String clusterFileName, final double backOffGaussianFactor ) { public VariantGaussianMixtureModel( final double _targetTITV, final String clusterFileName, final double backOffGaussianFactor ) {
@ -115,21 +129,25 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
dataManager = new VariantDataManager( annotationLines ); dataManager = new VariantDataManager( annotationLines );
// Several of the clustering parameters aren't used the second time around in ApplyVariantClusters // Several of the clustering parameters aren't used the second time around in VariantRecalibrator.java
SHRINKAGE = 0;
DIRICHLET_PARAMETER = 0;
maxIterations = 0; maxIterations = 0;
minVarInCluster = 0;
stdThreshold = 0.0; stdThreshold = 0.0;
qualThreshold = 0.0; qualThreshold = 0.0;
FORCE_INDEPENDENT_ANNOTATIONS = false; FORCE_INDEPENDENT_ANNOTATIONS = false;
hyperParameter_a = null;
hyperParameter_b = null;
hyperParameter_lambda = null;
// BUGBUG: move this parsing out of the constructor // BUGBUG: move this parsing out of the constructor
numGaussians = clusterLines.size(); maxGaussians = clusterLines.size();
mu = new double[numGaussians][dataManager.numAnnotations]; mu = new double[maxGaussians][dataManager.numAnnotations];
final double sigmaVals[][][] = new double[numGaussians][dataManager.numAnnotations][dataManager.numAnnotations]; final double sigmaVals[][][] = new double[maxGaussians][dataManager.numAnnotations][dataManager.numAnnotations];
sigma = new Matrix[numGaussians]; sigma = new Matrix[maxGaussians];
sigmaInverse = new Matrix[numGaussians]; sigmaInverse = new Matrix[maxGaussians];
pClusterLog10 = new double[numGaussians]; pClusterLog10 = new double[maxGaussians];
determinant = new double[numGaussians]; determinant = new double[maxGaussians];
alleleCountFactorArray = new double[alleleCountLines.size() + 1]; alleleCountFactorArray = new double[alleleCountLines.size() + 1];
for( final String line : alleleCountLines ) { for( final String line : alleleCountLines ) {
@ -154,7 +172,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
kkk++; kkk++;
} }
logger.info("Found " + numGaussians + " clusters using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys); logger.info("Found " + maxGaussians + " clusters using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys);
} }
public final void run( final String clusterFileName ) { public final void run( final String clusterFileName ) {
@ -219,13 +237,58 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
logger.info(" " + numZeroWeight + " variants were removed from clustering due to having zero clustering weight."); logger.info(" " + numZeroWeight + " variants were removed from clustering due to having zero clustering weight.");
logger.info(" " + numOutlier + " variants were removed due to having annotations that were more than " + stdThreshold + " standard deviations away from the mean annotation value."); logger.info(" " + numOutlier + " variants were removed due to having annotations that were more than " + stdThreshold + " standard deviations away from the mean annotation value.");
logger.info(" " + numBadQual + " variants were removed because raw QUAL value was less than threshold (" + qualThreshold + ")."); logger.info(" " + numBadQual + " variants were removed because raw QUAL value was less than threshold (" + qualThreshold + ").");
createClusters( data, 0, numGaussians, clusterFileName );
generateEmpricalStats( data );
logger.info("Initializing using k-means...");
initializeUsingKMeans( data );
logger.info("... done!");
createClusters( data, 0, maxGaussians, clusterFileName );
// Simply cluster with all the variants. The knowns have been given more weight than the novels // Simply cluster with all the variants. The knowns have been given more weight than the novels
//logger.info("Clustering with " + dataManager.data.length + " variants."); //logger.info("Clustering with " + dataManager.data.length + " variants.");
//createClusters( dataManager.data, 0, numGaussians, clusterFileName ); //createClusters( dataManager.data, 0, numGaussians, clusterFileName );
} }
private void generateEmpricalStats( VariantDatum[] data ) {
final int numVariants = data.length;
final int numAnnotations = data[0].annotations.length;
empiricalMu = new double[numAnnotations];
final double[][] sigmaVals = new double[numAnnotations][numAnnotations];
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
empiricalMu[jjj] = 0.0;
for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
sigmaVals[jjj][ppp] = 0.0;
}
}
for(int iii = 0; iii < numVariants; iii++) {
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
empiricalMu[jjj] += data[iii].annotations[jjj] / ((double) numVariants);
}
}
for(int iii = 0; iii < numVariants; iii++) {
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
if( jjj == ppp ) {
sigmaVals[jjj][ppp] = 1.0; //0.01 * numAnnotations;
} else {
sigmaVals[jjj][ppp] = 0.0;
}
//sigmaVals[jjj][ppp] += (data[iii].annotations[jjj]-empiricalMu[jjj]) * (data[iii].annotations[ppp]-empiricalMu[ppp]);
}
}
}
empiricalSigma = new Matrix(sigmaVals);
//empiricalSigma.timesEquals(1.0 / (Math.pow(maxGaussians, 2.0 / ((double) numAnnotations))));
}
private void generateAlleleCountPrior() { private void generateAlleleCountPrior() {
final double[] acExpectation = new double[alleleCountFactorArray.length]; final double[] acExpectation = new double[alleleCountFactorArray.length];
@ -256,12 +319,76 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
return alleleCountFactorArray[alleleCount]; return alleleCountFactorArray[alleleCount];
} }
private void initializeUsingKMeans( final VariantDatum[] data ) {
final int numVariants = data.length;
final int numAnnotations = data[0].annotations.length;
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
mu[kkk] = data[rand.nextInt(numVariants)].annotations;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
mu[kkk][jjj] = -4.0 + 8.0 * rand.nextDouble();
}
}
for( int ttt = 0; ttt < 20; ttt++ ) {
performKMeansIteration( data );
}
}
private void performKMeansIteration( final VariantDatum[] data ) {
final int numVariants = data.length;
final int numAnnotations = data[0].annotations.length;
final int[] assignment = new int[numVariants];
for( int iii = 0; iii < numVariants; iii++ ) {
final VariantDatum datum = data[iii];
double minDistance = Double.MAX_VALUE;
int minCluster = -1;
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
double dist = 0.0;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
dist += (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]);
}
if(dist < minDistance) {
minDistance = dist;
minCluster = kkk;
}
}
assignment[iii] = minCluster;
}
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
mu[kkk][jjj] = 0.0;
}
int numAssigned = 0;
for( int iii = 0; iii < numVariants; iii++ ) {
if(assignment[iii] == kkk) {
numAssigned++;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
mu[kkk][jjj] += data[iii].annotations[jjj];
}
}
}
if(numAssigned != 0) {
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
mu[kkk][jjj] /= (double) numAssigned;
}
} else {
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
mu[kkk][jjj] = -4.0 + 8.0 * rand.nextDouble();
}
}
}
}
public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFileName ) { public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFileName ) {
final int numVariants = data.length; final int numVariants = data.length;
final int numAnnotations = data[0].annotations.length; final int numAnnotations = data[0].annotations.length;
final double[][] pVarInCluster = new double[numGaussians][numVariants]; // Probability that the variant is in that cluster = simply evaluate the multivariate Gaussian final double[][] pVarInCluster = new double[maxGaussians][numVariants]; // Probability that the variant is in that cluster = simply evaluate the multivariate Gaussian
// loop control variables: // loop control variables:
// iii - loop over data points // iii - loop over data points
@ -272,8 +399,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
// Set up the initial random Gaussians // Set up the initial random Gaussians
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) { for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
hyperParameter_a[kkk] = numAnnotations;
hyperParameter_b[kkk] = SHRINKAGE;
hyperParameter_lambda[kkk] = DIRICHLET_PARAMETER;
pClusterLog10[kkk] = Math.log10(1.0 / ((double) (stopCluster - startCluster))); pClusterLog10[kkk] = Math.log10(1.0 / ((double) (stopCluster - startCluster)));
mu[kkk] = data[rand.nextInt(numVariants)].annotations;
final double[][] randSigma = new double[numAnnotations][numAnnotations]; final double[][] randSigma = new double[numAnnotations][numAnnotations];
for( int jjj = 0; jjj < numAnnotations; jjj++ ) { for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int ppp = jjj; ppp < numAnnotations; ppp++ ) { for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
@ -316,14 +445,9 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
///////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
maximizeGaussians( data, pVarInCluster, startCluster, stopCluster ); maximizeGaussians( data, pVarInCluster, startCluster, stopCluster );
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Output cluster parameters at each iteration
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
printClusterParameters( clusterFileName + "." + ttt );
logger.info("Finished iteration " + ttt ); logger.info("Finished iteration " + ttt );
ttt++; ttt++;
if( currentLikelihood - previousLikelihood < MIN_PROB_CONVERGENCE) { if( Math.abs(currentLikelihood - previousLikelihood) < MIN_PROB_CONVERGENCE) {
logger.info("Convergence!"); logger.info("Convergence!");
break; break;
} }
@ -346,21 +470,20 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
final int numAnnotations = mu[0].length; final int numAnnotations = mu[0].length;
final int numVariants = dataManager.numVariants; for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
for( int kkk = 0; kkk < numGaussians; kkk++ ) { if( Math.pow(10.0, pClusterLog10[kkk]) > 1E-4 ) { // BUGBUG: make this a command line argument
if( Math.pow(10.0, pClusterLog10[kkk]) * numVariants > minVarInCluster ) {
final double sigmaVals[][] = sigma[kkk].getArray(); final double sigmaVals[][] = sigma[kkk].getArray();
outputFile.print("@!CLUSTER,"); outputFile.print("@!CLUSTER");
outputFile.print(Math.pow(10.0, pClusterLog10[kkk]) + ","); outputFile.print("," + Math.pow(10.0, pClusterLog10[kkk]));
for(int jjj = 0; jjj < numAnnotations; jjj++ ) { for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
outputFile.print(mu[kkk][jjj] + ","); outputFile.print("," + mu[kkk][jjj]);
} }
for(int jjj = 0; jjj < numAnnotations; jjj++ ) { for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
for(int ppp = 0; ppp < numAnnotations; ppp++ ) { for(int ppp = 0; ppp < numAnnotations; ppp++ ) {
outputFile.print(sigmaVals[jjj][ppp] + ","); outputFile.print("," + (sigmaVals[jjj][ppp] / hyperParameter_a[kkk]) );
} }
} }
outputFile.println(-1); outputFile.println();
} }
} }
outputFile.close(); outputFile.close();
@ -369,12 +492,15 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
} }
public static double decodeAnnotation( final String annotationKey, final VariantContext vc ) { public static double decodeAnnotation( final String annotationKey, final VariantContext vc, final boolean jitter ) {
double value; double value;
//if( annotationKey.equals("AB") && !vc.getAttributes().containsKey(annotationKey) ) { //if( annotationKey.equals("AB") && !vc.getAttributes().containsKey(annotationKey) ) {
// value = (0.5 - 0.005) + (0.01 * rand.nextDouble()); // HomVar calls don't have an allele balance // value = (0.5 - 0.005) + (0.01 * rand.nextDouble()); // HomVar calls don't have an allele balance
//} //}
if( annotationKey.equals("QUAL") ) { if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // HRun values must be jittered a bit to work in this GMM
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
value += -0.25 + 0.5 * rand.nextDouble();
} else if( annotationKey.equals("QUAL") ) {
value = vc.getPhredScaledQual(); value = vc.getPhredScaledQual();
} else { } else {
try { try {
@ -388,18 +514,18 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
public final double evaluateVariantLog10( final VariantContext vc ) { public final double evaluateVariantLog10( final VariantContext vc ) {
final double[] pVarInCluster = new double[numGaussians]; final double[] pVarInCluster = new double[maxGaussians];
final double[] annotations = new double[dataManager.numAnnotations]; final double[] annotations = new double[dataManager.numAnnotations];
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) { for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
final double value = decodeAnnotation( dataManager.annotationKeys.get(jjj), vc ); final double value = decodeAnnotation( dataManager.annotationKeys.get(jjj), vc, true );
annotations[jjj] = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj]; annotations[jjj] = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj];
} }
evaluateGaussiansForSingleVariant( annotations, pVarInCluster ); evaluateGaussiansForSingleVariant( annotations, pVarInCluster );
double sum = 0.0; double sum = 0.0;
for( int kkk = 0; kkk < numGaussians; kkk++ ) { for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
sum += pVarInCluster[kkk]; // * clusterTruePositiveRate[kkk]; sum += pVarInCluster[kkk]; // * clusterTruePositiveRate[kkk];
} }
@ -612,18 +738,32 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
final int numAnnotations = data[0].annotations.length; final int numAnnotations = data[0].annotations.length;
double likelihood = 0.0; double likelihood = 0.0;
final double sigmaVals[][][] = new double[numGaussians][][]; final double sigmaVals[][][] = new double[maxGaussians][][];
final double denomLog10[] = new double[numGaussians]; final double denomLog10[] = new double[maxGaussians];
final double pVarInClusterLog10[] = new double[numGaussians]; final double pVarInClusterLog10[] = new double[maxGaussians];
double pVarInClusterReals[]; double pVarInClusterReals[];
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) { for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
sigmaVals[kkk] = sigma[kkk].inverse().getArray(); sigmaVals[kkk] = sigma[kkk].inverse().getArray();
denomLog10[kkk] = Math.log10(Math.pow(2.0 * Math.PI, ((double)numAnnotations) / 2.0)) + Math.log10(Math.pow(determinant[kkk], 0.5)); for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
if( Double.isInfinite(denomLog10[kkk]) ) { for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
throw new StingException("Numerical Instability! Determinant value is too small: " + determinant[kkk] + sigmaVals[kkk][jjj][ppp] *= hyperParameter_a[kkk];
"Try running with fewer annotations and then with fewer Gaussians."); }
} }
double sum = 0.0;
for(int jjj = 1; jjj < numAnnotations; jjj++) {
sum += diGamma((hyperParameter_a[kkk] + 1.0 - jjj) / 2.0);
}
sum -= Math.log(determinant[kkk]);
sum += Math.log(2.0) * numAnnotations;
final double gamma = 0.5 * sum;
sum = 0.0;
for(int ccc = 0; ccc < maxGaussians; ccc++) {
sum += hyperParameter_lambda[ccc];
}
final double pi = diGamma(hyperParameter_lambda[kkk]) - diGamma(sum);
final double beta = (-1.0 * numAnnotations) / (2.0 * hyperParameter_b[kkk]);
denomLog10[kkk] = (pi / Math.log(10.0)) + (gamma / Math.log(10.0)) + (beta / Math.log(10.0));
} }
final double mult[] = new double[numAnnotations]; final double mult[] = new double[numAnnotations];
double sumWeight = 0.0; double sumWeight = 0.0;
@ -641,14 +781,15 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
sum += mult[jjj] * (data[iii].annotations[jjj] - mu[kkk][jjj]); sum += mult[jjj] * (data[iii].annotations[jjj] - mu[kkk][jjj]);
} }
pVarInClusterLog10[kkk] = pClusterLog10[kkk] + (( -0.5 * sum )/Math.log(10.0)) - denomLog10[kkk]; pVarInClusterLog10[kkk] = (( -0.5 * sum )/Math.log(10.0)) + denomLog10[kkk];
final double pVar = Math.pow(10.0, pVarInClusterLog10[kkk]); final double pVar = Math.pow(10.0, pVarInClusterLog10[kkk]);
likelihood += pVar * data[iii].weight; likelihood += pVar * data[iii].weight;
if( pVarInClusterLog10[kkk] > 0.0 || Double.isNaN(pVarInClusterLog10[kkk]) || Double.isInfinite(pVarInClusterLog10[kkk]) ) { if( Double.isNaN(pVarInClusterLog10[kkk]) || Double.isInfinite(pVarInClusterLog10[kkk]) ) {
logger.warn("det = " + sigma[kkk].det()); logger.warn("det = " + sigma[kkk].det());
logger.warn("denom = " + denomLog10[kkk]); logger.warn("denom = " + denomLog10[kkk]);
logger.warn("sumExp = " + sum); logger.warn("sumExp = " + sum);
logger.warn("mixtureLog10 = " + pClusterLog10[kkk]);
logger.warn("pVar = " + pVar); logger.warn("pVar = " + pVar);
for( int jjj = 0; jjj < numAnnotations; jjj++ ) { for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int ppp = 0; ppp < numAnnotations; ppp++ ) { for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
@ -673,7 +814,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
} }
logger.info("Explained likelihood = " + String.format("%.5f",likelihood / sumWeight)); logger.info("explained likelihood = " + String.format("%.5f",likelihood / sumWeight));
return likelihood / sumWeight; return likelihood / sumWeight;
} }
@ -682,7 +823,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
final int numAnnotations = annotations.length; final int numAnnotations = annotations.length;
final double mult[] = new double[numAnnotations]; final double mult[] = new double[numAnnotations];
for( int kkk = 0; kkk < numGaussians; kkk++ ) { for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
final double sigmaVals[][] = sigmaInverse[kkk].getArray(); final double sigmaVals[][] = sigmaInverse[kkk].getArray();
double sum = 0.0; double sum = 0.0;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) { for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
@ -705,8 +846,9 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
final int numVariants = data.length; final int numVariants = data.length;
final int numAnnotations = data[0].annotations.length; final int numAnnotations = data[0].annotations.length;
final double sigmaVals[][][] = new double[numGaussians][numAnnotations][numAnnotations]; final double sigmaVals[][][] = new double[maxGaussians][numAnnotations][numAnnotations];
final double meanVals[][] = new double[numGaussians][numAnnotations]; final double wishartVals[][] = new double[numAnnotations][numAnnotations];
final double meanVals[][] = new double[maxGaussians][numAnnotations];
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) { for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
for( int jjj = 0; jjj < numAnnotations; jjj++ ) { for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
@ -716,6 +858,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
} }
} }
double sumPK = 0.0; double sumPK = 0.0;
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) { for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
double sumProb = 0.0; double sumProb = 0.0;
@ -728,69 +871,60 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
sumProb += prob; sumProb += prob;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) { for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
meanVals[kkk][jjj] += prob * data[iii].annotations[jjj]; meanVals[kkk][jjj] += prob * data[iii].annotations[jjj];
} }
} }
for( int jjj = 0; jjj < numAnnotations; jjj++ ) { for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
meanVals[kkk][jjj] /= sumProb; meanVals[kkk][jjj] = (meanVals[kkk][jjj] + SHRINKAGE * empiricalMu[jjj]) / (sumProb + SHRINKAGE);
}
final double shrinkageFactor = (SHRINKAGE * sumProb) / (SHRINKAGE + sumProb);
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
wishartVals[jjj][ppp] = shrinkageFactor * (meanVals[kkk][jjj] - empiricalMu[jjj]) * (meanVals[kkk][ppp] - empiricalMu[ppp]);
}
} }
for( int iii = 0; iii < numVariants; iii++ ) { for( int iii = 0; iii < numVariants; iii++ ) {
final double prob = pVarInCluster[kkk][iii]; final double prob = pVarInCluster[kkk][iii];
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
sigmaVals[kkk][jjj][ppp] += prob * (data[iii].annotations[jjj]-meanVals[kkk][jjj]) * (data[iii].annotations[ppp]-meanVals[kkk][ppp]);
}
}
}
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
if( sigmaVals[kkk][jjj][ppp] < MIN_SIGMA && sigmaVals[kkk][jjj][ppp] > -MIN_SIGMA ) { // Very small numbers are a very big problem
logger.warn("The sigma values look exceptionally small.... Probably about to crash due to numeric instability.");
}
sigmaVals[kkk][ppp][jjj] = sigmaVals[kkk][jjj][ppp]; // sigma must be a symmetric matrix
}
}
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
sigmaVals[kkk][jjj][ppp] /= sumProb;
}
}
if( FORCE_INDEPENDENT_ANNOTATIONS ) {
for( int jjj = 0; jjj < numAnnotations; jjj++ ) { for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
for( int ppp = 0; ppp < numAnnotations; ppp++ ) { for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
if(jjj!=ppp) { sigmaVals[kkk][jjj][ppp] += prob * (data[iii].annotations[jjj]-meanVals[kkk][jjj]) * (data[iii].annotations[ppp]-meanVals[kkk][ppp]);
sigmaVals[kkk][jjj][ppp] = 0.0;
}
} }
} }
} }
final Matrix tmpMatrix = new Matrix(sigmaVals[kkk]);
if( tmpMatrix.det() > MIN_DETERMINANT ) {
sigma[kkk] = new Matrix(sigmaVals[kkk]);
determinant[kkk] = sigma[kkk].det();
for( int jjj = 0; jjj < numAnnotations; jjj++ ) { final Matrix tmpMatrix = empiricalSigma.plus(new Matrix(wishartVals).plus(new Matrix(sigmaVals[kkk])));
mu[kkk][jjj] = meanVals[kkk][jjj];
} sigma[kkk] = (Matrix)tmpMatrix.clone();
} else { determinant[kkk] = sigma[kkk].det();
logger.warn("Tried to create a covariance matrix with exceptionally small determinant.");
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
mu[kkk][jjj] = meanVals[kkk][jjj];
} }
pClusterLog10[kkk] = sumProb; pClusterLog10[kkk] = sumProb;
sumPK += sumProb; sumPK += sumProb;
hyperParameter_a[kkk] = sumProb + numAnnotations;
hyperParameter_b[kkk] = sumProb + SHRINKAGE;
hyperParameter_lambda[kkk] = sumProb + DIRICHLET_PARAMETER;
} }
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) { for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
pClusterLog10[kkk] = Math.log10( pClusterLog10[kkk] / sumPK ); pClusterLog10[kkk] = Math.log10( pClusterLog10[kkk] / sumPK );
} }
pClusterLog10 = MathUtils.normalizeFromLog10( pClusterLog10, true ); pClusterLog10 = MathUtils.normalizeFromLog10( pClusterLog10, true );
} }
// from http://en.wikipedia.org/wiki/Digamma_function
// According to J.M. Bernardo AS 103 algorithm the digamma function for x, a real number, can be approximated by:
private static double diGamma(final double x) {
return Math.log(x) - ( 1.0 / (2.0 * x) )
- ( 1.0 / (12.0 * Math.pow(x, 2.0)) )
+ ( 1.0 / (120.0 * Math.pow(x, 4.0)) )
- ( 1.0 / (252.0 * Math.pow(x, 6.0)) );
}
} }

View File

@ -64,7 +64,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Argument(fullName="target_titv", shortName="titv", doc="The expected Ti/Tv ratio to display on optimization curve output figures. (~~2.1 for whole genome experiments)", required=false) @Argument(fullName="target_titv", shortName="titv", doc="The expected Ti/Tv ratio to display on optimization curve output figures. (~~2.1 for whole genome experiments)", required=false)
private double TARGET_TITV = 2.1; private double TARGET_TITV = 2.1;
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by spreading out the Gaussians.", required=false) @Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by spreading out the Gaussians.", required=false)
private double BACKOFF_FACTOR = 1.4; private double BACKOFF_FACTOR = 1.3;
@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false) @Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
private int DESIRED_NUM_VARIANTS = 0; private int DESIRED_NUM_VARIANTS = 0;
@Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false) @Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
@ -76,7 +76,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Argument(fullName="novel_prior", shortName="novelPrior", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false) @Argument(fullName="novel_prior", shortName="novelPrior", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
private int NOVEL_QUAL_PRIOR = 2; private int NOVEL_QUAL_PRIOR = 2;
@Argument(fullName="quality_scale_factor", shortName="qScale", doc="Multiply all final quality scores by this value. Needed to normalize the quality scores.", required=false) @Argument(fullName="quality_scale_factor", shortName="qScale", doc="Multiply all final quality scores by this value. Needed to normalize the quality scores.", required=false)
private double QUALITY_SCALE_FACTOR = 200.0; private double QUALITY_SCALE_FACTOR = 1600.0;
@Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false) @Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
private String OUTPUT_PREFIX = "optimizer"; private String OUTPUT_PREFIX = "optimizer";
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true) @Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
@ -199,13 +199,13 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
variantDatum.qual = QUALITY_SCALE_FACTOR * QualityUtils.phredScaleErrorRate( Math.max(1.0 - pTrue, 0.000000001) ); // BUGBUG: don't have a normalizing constant, so need to scale up qual scores arbitrarily variantDatum.qual = QUALITY_SCALE_FACTOR * QualityUtils.phredScaleErrorRate( Math.max(1.0 - pTrue, 0.000000001) ); // BUGBUG: don't have a normalizing constant, so need to scale up qual scores arbitrarily
mapList.add( variantDatum ); mapList.add( variantDatum );
vcf.addInfoField("OQ", ((Double)vc.getPhredScaledQual()).toString() ); vcf.addInfoField("OQ", String.format("%.2f", ((Double)vc.getPhredScaledQual())));
vcf.setQual( variantDatum.qual ); vcf.setQual( variantDatum.qual );
vcf.setFilterString(VCFRecord.PASSES_FILTERS); vcf.setFilterString(VCFRecord.PASSES_FILTERS);
vcfWriter.addRecord( vcf ); vcfWriter.addRecord( vcf );
} else { // not a SNP or is filtered so just dump it out to the VCF file } else { // not a SNP or is filtered so just dump it out to the VCF file
vcfWriter.addRecord( vcf ); vcfWriter.addRecord( vcf );
} }
} }

View File

@ -12,7 +12,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
@Test @Test
public void testGenerateVariantClusters() { public void testGenerateVariantClusters() {
HashMap<String, String> e = new HashMap<String, String>(); HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "9b7517ff1fd0fc23a2596acc82e2ed96" ); e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "72558d2c49bb94dc59e9d4146fe0bc05" );
for ( Map.Entry<String, String> entry : e.entrySet() ) { for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey(); String vcf = entry.getKey();
@ -24,7 +24,6 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
" -T GenerateVariantClusters" + " -T GenerateVariantClusters" +
" -B input,VCF," + vcf + " -B input,VCF," + vcf +
" -L 1:1-100,000,000" + " -L 1:1-100,000,000" +
" -nG 6" +
" --ignore_filter GATK_STANDARD" + " --ignore_filter GATK_STANDARD" +
" -an QD -an HRun -an SB" + " -an QD -an HRun -an SB" +
" -clusterFile %s", " -clusterFile %s",
@ -38,7 +37,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
@Test @Test
public void testVariantRecalibrator() { public void testVariantRecalibrator() {
HashMap<String, String> e = new HashMap<String, String>(); HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "e0e3d959929aa3940a81c9926d1406e2" ); e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "d41c4326e589f1746278f1ed9815291a" );
for ( Map.Entry<String, String> entry : e.entrySet() ) { for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey(); String vcf = entry.getKey();