reverting optimizer changes

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3112 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-04-02 16:59:13 +00:00
parent 687fd477ff
commit 642c969896
5 changed files with 84 additions and 53 deletions

View File

@ -53,16 +53,18 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.1 for whole genome experiments)", required=true)
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.1 for whole genome experiments)", required=false)
private double TARGET_TITV = 2.1;
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by spreading out the Gaussians.", required=false)
private double BACKOFF_FACTOR = 1.0;
@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
private int DESIRED_NUM_VARIANTS = 0;
@Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
private boolean IGNORE_ALL_INPUT_FILTERS = false;
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
private String[] IGNORE_INPUT_FILTERS = null;
@Argument(fullName="known_prior", shortName="knownPrior", doc="A prior on the quality of known variants, a phred scaled probability of being true. Default is 30.0", required=false)
private double KNOWN_VAR_QUAL_PRIOR = 30.0;
@Argument(fullName="known_prior", shortName="knownPrior", doc="A prior on the quality of known variants, a phred scaled probability of being true. Setting to 0.0 means unused.", required=false)
private double KNOWN_VAR_QUAL_PRIOR = 0.0;
@Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
private String OUTPUT_PREFIX = "optimizer";
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
@ -97,7 +99,7 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
switch (OPTIMIZATION_MODEL) {
case GAUSSIAN_MIXTURE_MODEL:
theModel = new VariantGaussianMixtureModel( CLUSTER_FILENAME );
theModel = new VariantGaussianMixtureModel( CLUSTER_FILENAME, BACKOFF_FACTOR );
break;
//case K_NEAREST_NEIGHBORS:
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
@ -156,7 +158,7 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
final double pTrue = theModel.evaluateVariant( rodVCF.getInfoValues(), rodVCF.getQual(), variantDatum.isHet );
final double recalQual = QualityUtils.phredScaleErrorRate( Math.max( 1.0 - pTrue, 0.000000001) );
if( variantDatum.isKnown ) {
if( variantDatum.isKnown && KNOWN_VAR_QUAL_PRIOR > 0.1 ) {
variantDatum.qual = 0.5 * recalQual + 0.5 * KNOWN_VAR_QUAL_PRIOR;
} else {
variantDatum.qual = recalQual;

View File

@ -70,6 +70,7 @@ public class VariantConcordanceROCCurveWalker extends RodWalker<ExpandingArrayLi
private int[] trueNegGlobal;
private int[] falseNegGlobal;
private String sampleName = null;
private boolean multiSampleCalls = false;
//---------------------------------------------------------------------------------------------------------------
//
@ -82,10 +83,19 @@ public class VariantConcordanceROCCurveWalker extends RodWalker<ExpandingArrayLi
for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
if( rod != null && !rod.getName().toUpperCase().startsWith("TRUTH") ) {
if( rod.getReferenceOrderedData().getIterator().next().getUnderlyingObject() instanceof RodVCF ) {
if( rod.getReferenceOrderedData().getIterator().hasNext() && rod.getReferenceOrderedData().getIterator().next().getUnderlyingObject() instanceof RodVCF ) {
inputRodNames.add(rod.getName());
if( sampleName == null ) {
sampleName = ((RodVCF)rod.getReferenceOrderedData().getIterator().next().getUnderlyingObject()).getSampleNames()[0]; // BUGBUG: single sample calls only for now
System.out.println("Adding " + rod.getName() + " to input RodVCF list.");
if( sampleName == null && !multiSampleCalls ) {
final String[] samples = ((RodVCF)rod.getReferenceOrderedData().getIterator().next().getUnderlyingObject()).getSampleNames();
if( samples.length > 1 ) {
multiSampleCalls = true;
System.out.println("Found multi sample calls.");
} else {
sampleName = samples[0];
System.out.println("Found single sample calls.");
}
}
}
}
@ -119,17 +129,21 @@ public class VariantConcordanceROCCurveWalker extends RodWalker<ExpandingArrayLi
for( final VariantContext vc : tracker.getAllVariantContexts(null, context.getLocation(), false, false) ) {
if( vc != null && vc.getName().toUpperCase().startsWith("TRUTH") ) {
if( vc.isSNP() && !vc.isFiltered() ) {
if( !vc.getGenotype(sampleName).isNoCall() ) {
if( multiSampleCalls ) {
isInTruthSet = true;
if( !vc.getGenotype(sampleName).isHomRef() ) {
if( vc.isPolymorphic() ) {
isTrueVariant = true;
}
} else {
if( !vc.getGenotype(sampleName).isNoCall() ) {
isInTruthSet = true;
if( !vc.getGenotype(sampleName).isHomRef() ) {
isTrueVariant = true;
}
}
}
}
//if( vc.isPolymorphic() ) { //BUGBUG: I don't think this is the right thing to do here, there are many polymorphic sites in the truth data because there are many samples
// isTrueVariant = true;
//}
}
}
}
@ -146,7 +160,11 @@ public class VariantConcordanceROCCurveWalker extends RodWalker<ExpandingArrayLi
variantDatum.qual = vc.getPhredScaledQual();
variantDatum.isTrueVariant = isTrueVariant;
mapList.add( new Pair<String,VariantDatum>(inputName, variantDatum) );
} else { // Either not in the call set at all, is a monomorphic call, or was filtered out
}
else if ( vc != null && vc.isFiltered() && vc.getFilters().contains("HARD_TO_VALIDATE") ) { // hard to validate sites are hard to validate so don't count them
}
else { // Either not in the call set at all, is a monomorphic call, or was filtered out
if( isTrueVariant ) { // ... but it is a true variant so this is a false negative call
falseNegGlobal[curveIndex]++;
} else { // ... and it is not a variant site so this is a true negative call

View File

@ -55,18 +55,19 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
private final double[][] mu; // The means for the clusters
private final double[][] sigma; // The variances for the clusters, sigma is really sigma^2
private final double[] pCluster;
private final boolean[] isHetCluster;
//private final boolean[] isHetCluster;
private final int[] numMaxClusterKnown;
private final int[] numMaxClusterNovel;
private final double[] clusterTITV;
private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio
private final int minVarInCluster;
private final double knownAlphaFactor;
private static final double INFINITE_ANNOTATION_VALUE = 10000.0;
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations, final int _minVarInCluster ) {
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations, final int _minVarInCluster, final double _knownAlphaFactor ) {
super( _targetTITV );
dataManager = _dataManager;
numGaussians = ( _numGaussians % 2 == 0 ? _numGaussians : _numGaussians + 1 );
@ -75,15 +76,16 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
mu = new double[numGaussians][];
sigma = new double[numGaussians][];
pCluster = new double[numGaussians];
isHetCluster = null;
//isHetCluster = null;
numMaxClusterKnown = new int[numGaussians];
numMaxClusterNovel = new int[numGaussians];
clusterTITV = new double[numGaussians];
clusterTruePositiveRate = new double[numGaussians];
minVarInCluster = _minVarInCluster;
knownAlphaFactor = _knownAlphaFactor;
}
public VariantGaussianMixtureModel( final String clusterFileName ) {
public VariantGaussianMixtureModel( final String clusterFileName, final double backOffGaussianFactor ) {
super( 0.0 );
final ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
final ExpandingArrayList<String> clusterLines = new ExpandingArrayList<String>();
@ -108,24 +110,25 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
numMaxClusterNovel = null;
clusterTITV = null;
minVarInCluster = 0;
knownAlphaFactor = 0.0;
//BUGBUG: move this parsing out of the constructor
numGaussians = clusterLines.size();
mu = new double[numGaussians][dataManager.numAnnotations];
sigma = new double[numGaussians][dataManager.numAnnotations];
pCluster = new double[numGaussians];
isHetCluster = new boolean[numGaussians];
//isHetCluster = new boolean[numGaussians];
clusterTruePositiveRate = new double[numGaussians];
int kkk = 0;
for( String line : clusterLines ) {
final String[] vals = line.split(",");
isHetCluster[kkk] = Integer.parseInt(vals[1]) == 1;
//isHetCluster[kkk] = Integer.parseInt(vals[1]) == 1;
pCluster[kkk] = Double.parseDouble(vals[2]);
clusterTruePositiveRate[kkk] = Double.parseDouble(vals[6]); //BUGBUG: #define these magic index numbers, very easy to make a mistake here
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
mu[kkk][jjj] = Double.parseDouble(vals[7+jjj]);
sigma[kkk][jjj] = Double.parseDouble(vals[7+dataManager.numAnnotations+jjj]) * 3; //BUGBUG: *1.3, suggestion by Nick to prevent GMM from over fitting and producing low likelihoods for most points
sigma[kkk][jjj] = Double.parseDouble(vals[7+dataManager.numAnnotations+jjj]) * backOffGaussianFactor; //BUGBUG: *3, suggestion by Nick to prevent GMM from over fitting and producing low likelihoods for most points
}
kkk++;
}
@ -156,7 +159,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
// This block of code is used to cluster with novels + 1.5x knowns mixed together
/*
VariantDatum[] data;
// Grab a set of data that is all of the novel variants plus 1.5x as many known variants drawn at random
@ -183,12 +186,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
System.out.println("Clustering with " + numNovel + " novel variants and " + (data.length - numNovel) + " known variants...");
if( data.length == dataManager.numVariants ) { System.out.println(" (used all variants since 2.5*numNovel is so large compared to the full set) "); }
createClusters( data ); // Using a subset of the data
createClusters( data, 0, numGaussians ); // Using a subset of the data
System.out.println("Outputting cluster parameters...");
printClusters( clusterFileName );
//System.out.println("Applying clusters to all variants...");
//applyClusters( dataManager.data, outputPrefix ); // Using all the data
*/
@ -243,6 +244,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
*/
/*
// This block of code is to cluster het and hom calls separately, but mixing together knowns and novels
final VariantDatum[] dataHet = new VariantDatum[Math.min(numHet,MAX_VARS)];
final VariantDatum[] dataHom = new VariantDatum[Math.min(numHom,MAX_VARS)];
@ -288,11 +290,11 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
createClusters( dataHom, numGaussians / 2, numGaussians );
System.out.println("Outputting cluster parameters...");
printClusters( clusterFileName );
*/
}
/*
public final void createClusters( final VariantDatum[] data, int startCluster, int stopCluster ) {
final int numVariants = data.length;
@ -403,19 +405,19 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
clusterTITV[kkk] = probTi[kkk] / probTv[kkk];
if( probKnown[kkk] > 2000.0 ) { // BUGBUG: make this a command line argument, parameterize performance based on this important argument
clusterTruePositiveRate[kkk] = calcTruePositiveRateFromKnownTITV( probKnownTi[kkk] / probKnownTv[kkk], probNovelTi[kkk] / probNovelTv[kkk] );
if( probKnown[kkk] > 600.0 ) { // BUGBUG: make this a command line argument, parameterize performance based on this important argument
clusterTruePositiveRate[kkk] = calcTruePositiveRateFromKnownTITV( probKnownTi[kkk] / probKnownTv[kkk], probNovelTi[kkk] / probNovelTv[kkk], clusterTITV[kkk], knownAlphaFactor );
} else {
clusterTruePositiveRate[kkk] = calcTruePositiveRateFromTITV( clusterTITV[kkk] );
}
}
}
*/
// This cluster method doesn't make use of the differences between known and novel Ti/Tv ratios
/*
public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster ) {
final int numVariants = data.length;
@ -507,7 +509,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
clusterTruePositiveRate[kkk] = calcTruePositiveRateFromTITV( clusterTITV[kkk] );
}
}
*/
private void printClusters( final String clusterFileName ) {
@ -568,9 +570,9 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
double sum = 0.0;
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
if( isHetCluster[kkk] == isHet ) {
//if( isHetCluster[kkk] == isHet ) {
sum += pVarInCluster[kkk] * clusterTruePositiveRate[kkk];
}
//}
}
return sum;
@ -654,7 +656,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
/ sigma[kkk][jjj];
}
pVarInCluster[kkk][iii] = pCluster[kkk] * Math.exp( -0.5 * sum );
if( pVarInCluster[kkk][iii] < MIN_PROB) { // Very small numbers are a very big problem
pVarInCluster[kkk][iii] = MIN_PROB;
}
@ -677,30 +679,30 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
double sumProb = 0.0;
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
if( isHetCluster[kkk] == isHet ) {
//if( isHetCluster[kkk] == isHet ) {
double sum = 0.0;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
sum += ( (annotations[jjj] - mu[kkk][jjj]) * (annotations[jjj] - mu[kkk][jjj]) )
/ sigma[kkk][jjj];
}
//BUGBUG: pCluster[kkk] should be removed here?
pVarInCluster[kkk] = pCluster[kkk] * Math.exp( -0.5 * sum );
//pVarInCluster[kkk] = Math.exp( -0.5 * sum );
// BUGBUG: reverting to old version that didn't have pCluster[kkk]* here, this meant that the overfitting parameters changed meanings
//pVarInCluster[kkk] = pCluster[kkk] * Math.exp( -0.5 * sum );
pVarInCluster[kkk] = Math.exp( -0.5 * sum );
if( pVarInCluster[kkk] < MIN_PROB) { // Very small numbers are a very big problem
pVarInCluster[kkk] = MIN_PROB;
}
sumProb += pVarInCluster[kkk];
}
//}
}
if( sumProb > MIN_SUM_PROB ) { // Very small numbers are a very big problem
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
if( isHetCluster[kkk] == isHet ) {
//if( isHetCluster[kkk] == isHet ) {
pVarInCluster[kkk] /= sumProb;
}
//}
}
}
}
@ -828,4 +830,4 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
}
}

View File

@ -44,7 +44,8 @@ public abstract class VariantOptimizationModel implements VariantOptimizationInt
targetTITV = _targetTITV;
}
public final double calcTruePositiveRateFromTITV( double titv ) {
public final double calcTruePositiveRateFromTITV( final double _titv ) {
double titv = _titv;
if( titv > targetTITV ) { titv -= 2.0f*(titv-targetTITV); }
if( titv < 0.5 ) { titv = 0.5; }
return ( (titv - 0.5) / (targetTITV - 0.5) );
@ -52,9 +53,14 @@ public abstract class VariantOptimizationModel implements VariantOptimizationInt
//return ( titv / targetTITV );
}
public final double calcTruePositiveRateFromKnownTITV( final double knownTITV, double novelTITV ) {
public final double calcTruePositiveRateFromKnownTITV( final double knownTITV, final double _novelTITV, final double overallTITV, final double knownAlphaFactor ) {
final double tprTarget = calcTruePositiveRateFromTITV( overallTITV );
double novelTITV = _novelTITV;
if( novelTITV > knownTITV ) { novelTITV -= 2.0f*(novelTITV-knownTITV); }
if( novelTITV < 0.5 ) { novelTITV = 0.5; }
return ( (novelTITV - 0.5) / (knownTITV - 0.5) );
final double tprKnown = ( (novelTITV - 0.5) / (knownTITV - 0.5) );
return ( knownAlphaFactor * tprKnown ) + ( (1.0 - knownAlphaFactor) * tprTarget );
}
}

View File

@ -55,6 +55,8 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
/////////////////////////////
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.1 for whole genome experiments)", required=true)
private double TARGET_TITV = 2.1;
@Argument(fullName="known_alpha_factor", shortName="kFactor", doc="Percentage of true positive rate that is due to difference between known and novel titv in a cluster as opposed to difference from target titv", required=false)
private double KNOWN_ALPHA_FACTOR = 0.0;
@Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
private boolean IGNORE_ALL_INPUT_FILTERS = false;
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
@ -66,11 +68,11 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
private String CLUSTER_FILENAME = "optimizer.cluster";
@Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false)
private int NUM_GAUSSIANS = 100;
private int NUM_GAUSSIANS = 32;
@Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false)
private int NUM_ITERATIONS = 7;
private int NUM_ITERATIONS = 10;
@Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. It can be used to prevent overfitting.", required=false)
private int MIN_VAR_IN_CLUSTER = 0;
private int MIN_VAR_IN_CLUSTER = 2000;
//@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
//private int NUM_KNN = 2000;
@ -160,6 +162,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
variantDatum.isKnown = !vc.getAttribute("ID").equals(".");
variantDatum.isHet = vc.getHetCount() > vc.getHomVarCount(); // BUGBUG: what to do here for multi sample calls?
mapList.add( variantDatum );
}
}
@ -197,7 +200,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
VariantOptimizationModel theModel;
switch (OPTIMIZATION_MODEL) {
case GAUSSIAN_MIXTURE_MODEL:
theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER );
theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER, KNOWN_ALPHA_FACTOR );
break;
//case K_NEAREST_NEIGHBORS:
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );