reverting optimizer changes
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3112 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
687fd477ff
commit
642c969896
|
|
@ -53,16 +53,18 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
|||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.1 for whole genome experiments)", required=true)
|
||||
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.1 for whole genome experiments)", required=false)
|
||||
private double TARGET_TITV = 2.1;
|
||||
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by spreading out the Gaussians.", required=false)
|
||||
private double BACKOFF_FACTOR = 1.0;
|
||||
@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
|
||||
private int DESIRED_NUM_VARIANTS = 0;
|
||||
@Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||
private boolean IGNORE_ALL_INPUT_FILTERS = false;
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||
private String[] IGNORE_INPUT_FILTERS = null;
|
||||
@Argument(fullName="known_prior", shortName="knownPrior", doc="A prior on the quality of known variants, a phred scaled probability of being true. Default is 30.0", required=false)
|
||||
private double KNOWN_VAR_QUAL_PRIOR = 30.0;
|
||||
@Argument(fullName="known_prior", shortName="knownPrior", doc="A prior on the quality of known variants, a phred scaled probability of being true. Setting to 0.0 means unused.", required=false)
|
||||
private double KNOWN_VAR_QUAL_PRIOR = 0.0;
|
||||
@Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
|
||||
private String OUTPUT_PREFIX = "optimizer";
|
||||
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||
|
|
@ -97,7 +99,7 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
|||
|
||||
switch (OPTIMIZATION_MODEL) {
|
||||
case GAUSSIAN_MIXTURE_MODEL:
|
||||
theModel = new VariantGaussianMixtureModel( CLUSTER_FILENAME );
|
||||
theModel = new VariantGaussianMixtureModel( CLUSTER_FILENAME, BACKOFF_FACTOR );
|
||||
break;
|
||||
//case K_NEAREST_NEIGHBORS:
|
||||
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
|
||||
|
|
@ -156,7 +158,7 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
|||
final double pTrue = theModel.evaluateVariant( rodVCF.getInfoValues(), rodVCF.getQual(), variantDatum.isHet );
|
||||
final double recalQual = QualityUtils.phredScaleErrorRate( Math.max( 1.0 - pTrue, 0.000000001) );
|
||||
|
||||
if( variantDatum.isKnown ) {
|
||||
if( variantDatum.isKnown && KNOWN_VAR_QUAL_PRIOR > 0.1 ) {
|
||||
variantDatum.qual = 0.5 * recalQual + 0.5 * KNOWN_VAR_QUAL_PRIOR;
|
||||
} else {
|
||||
variantDatum.qual = recalQual;
|
||||
|
|
|
|||
|
|
@ -70,6 +70,7 @@ public class VariantConcordanceROCCurveWalker extends RodWalker<ExpandingArrayLi
|
|||
private int[] trueNegGlobal;
|
||||
private int[] falseNegGlobal;
|
||||
private String sampleName = null;
|
||||
private boolean multiSampleCalls = false;
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -82,10 +83,19 @@ public class VariantConcordanceROCCurveWalker extends RodWalker<ExpandingArrayLi
|
|||
|
||||
for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
|
||||
if( rod != null && !rod.getName().toUpperCase().startsWith("TRUTH") ) {
|
||||
if( rod.getReferenceOrderedData().getIterator().next().getUnderlyingObject() instanceof RodVCF ) {
|
||||
if( rod.getReferenceOrderedData().getIterator().hasNext() && rod.getReferenceOrderedData().getIterator().next().getUnderlyingObject() instanceof RodVCF ) {
|
||||
inputRodNames.add(rod.getName());
|
||||
if( sampleName == null ) {
|
||||
sampleName = ((RodVCF)rod.getReferenceOrderedData().getIterator().next().getUnderlyingObject()).getSampleNames()[0]; // BUGBUG: single sample calls only for now
|
||||
System.out.println("Adding " + rod.getName() + " to input RodVCF list.");
|
||||
if( sampleName == null && !multiSampleCalls ) {
|
||||
final String[] samples = ((RodVCF)rod.getReferenceOrderedData().getIterator().next().getUnderlyingObject()).getSampleNames();
|
||||
|
||||
if( samples.length > 1 ) {
|
||||
multiSampleCalls = true;
|
||||
System.out.println("Found multi sample calls.");
|
||||
} else {
|
||||
sampleName = samples[0];
|
||||
System.out.println("Found single sample calls.");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -119,17 +129,21 @@ public class VariantConcordanceROCCurveWalker extends RodWalker<ExpandingArrayLi
|
|||
for( final VariantContext vc : tracker.getAllVariantContexts(null, context.getLocation(), false, false) ) {
|
||||
if( vc != null && vc.getName().toUpperCase().startsWith("TRUTH") ) {
|
||||
if( vc.isSNP() && !vc.isFiltered() ) {
|
||||
if( !vc.getGenotype(sampleName).isNoCall() ) {
|
||||
if( multiSampleCalls ) {
|
||||
isInTruthSet = true;
|
||||
|
||||
if( !vc.getGenotype(sampleName).isHomRef() ) {
|
||||
if( vc.isPolymorphic() ) {
|
||||
isTrueVariant = true;
|
||||
}
|
||||
} else {
|
||||
if( !vc.getGenotype(sampleName).isNoCall() ) {
|
||||
isInTruthSet = true;
|
||||
|
||||
if( !vc.getGenotype(sampleName).isHomRef() ) {
|
||||
isTrueVariant = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
//if( vc.isPolymorphic() ) { //BUGBUG: I don't think this is the right thing to do here, there are many polymorphic sites in the truth data because there are many samples
|
||||
// isTrueVariant = true;
|
||||
//}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -146,7 +160,11 @@ public class VariantConcordanceROCCurveWalker extends RodWalker<ExpandingArrayLi
|
|||
variantDatum.qual = vc.getPhredScaledQual();
|
||||
variantDatum.isTrueVariant = isTrueVariant;
|
||||
mapList.add( new Pair<String,VariantDatum>(inputName, variantDatum) );
|
||||
} else { // Either not in the call set at all, is a monomorphic call, or was filtered out
|
||||
}
|
||||
else if ( vc != null && vc.isFiltered() && vc.getFilters().contains("HARD_TO_VALIDATE") ) { // hard to validate sites are hard to validate so don't count them
|
||||
|
||||
}
|
||||
else { // Either not in the call set at all, is a monomorphic call, or was filtered out
|
||||
if( isTrueVariant ) { // ... but it is a true variant so this is a false negative call
|
||||
falseNegGlobal[curveIndex]++;
|
||||
} else { // ... and it is not a variant site so this is a true negative call
|
||||
|
|
|
|||
|
|
@ -55,18 +55,19 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
private final double[][] mu; // The means for the clusters
|
||||
private final double[][] sigma; // The variances for the clusters, sigma is really sigma^2
|
||||
private final double[] pCluster;
|
||||
private final boolean[] isHetCluster;
|
||||
//private final boolean[] isHetCluster;
|
||||
private final int[] numMaxClusterKnown;
|
||||
private final int[] numMaxClusterNovel;
|
||||
private final double[] clusterTITV;
|
||||
private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio
|
||||
private final int minVarInCluster;
|
||||
private final double knownAlphaFactor;
|
||||
|
||||
private static final double INFINITE_ANNOTATION_VALUE = 10000.0;
|
||||
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
|
||||
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
|
||||
|
||||
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations, final int _minVarInCluster ) {
|
||||
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations, final int _minVarInCluster, final double _knownAlphaFactor ) {
|
||||
super( _targetTITV );
|
||||
dataManager = _dataManager;
|
||||
numGaussians = ( _numGaussians % 2 == 0 ? _numGaussians : _numGaussians + 1 );
|
||||
|
|
@ -75,15 +76,16 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
mu = new double[numGaussians][];
|
||||
sigma = new double[numGaussians][];
|
||||
pCluster = new double[numGaussians];
|
||||
isHetCluster = null;
|
||||
//isHetCluster = null;
|
||||
numMaxClusterKnown = new int[numGaussians];
|
||||
numMaxClusterNovel = new int[numGaussians];
|
||||
clusterTITV = new double[numGaussians];
|
||||
clusterTruePositiveRate = new double[numGaussians];
|
||||
minVarInCluster = _minVarInCluster;
|
||||
knownAlphaFactor = _knownAlphaFactor;
|
||||
}
|
||||
|
||||
public VariantGaussianMixtureModel( final String clusterFileName ) {
|
||||
public VariantGaussianMixtureModel( final String clusterFileName, final double backOffGaussianFactor ) {
|
||||
super( 0.0 );
|
||||
final ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
|
||||
final ExpandingArrayList<String> clusterLines = new ExpandingArrayList<String>();
|
||||
|
|
@ -108,24 +110,25 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
numMaxClusterNovel = null;
|
||||
clusterTITV = null;
|
||||
minVarInCluster = 0;
|
||||
knownAlphaFactor = 0.0;
|
||||
|
||||
//BUGBUG: move this parsing out of the constructor
|
||||
numGaussians = clusterLines.size();
|
||||
mu = new double[numGaussians][dataManager.numAnnotations];
|
||||
sigma = new double[numGaussians][dataManager.numAnnotations];
|
||||
pCluster = new double[numGaussians];
|
||||
isHetCluster = new boolean[numGaussians];
|
||||
//isHetCluster = new boolean[numGaussians];
|
||||
clusterTruePositiveRate = new double[numGaussians];
|
||||
|
||||
int kkk = 0;
|
||||
for( String line : clusterLines ) {
|
||||
final String[] vals = line.split(",");
|
||||
isHetCluster[kkk] = Integer.parseInt(vals[1]) == 1;
|
||||
//isHetCluster[kkk] = Integer.parseInt(vals[1]) == 1;
|
||||
pCluster[kkk] = Double.parseDouble(vals[2]);
|
||||
clusterTruePositiveRate[kkk] = Double.parseDouble(vals[6]); //BUGBUG: #define these magic index numbers, very easy to make a mistake here
|
||||
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] = Double.parseDouble(vals[7+jjj]);
|
||||
sigma[kkk][jjj] = Double.parseDouble(vals[7+dataManager.numAnnotations+jjj]) * 3; //BUGBUG: *1.3, suggestion by Nick to prevent GMM from over fitting and producing low likelihoods for most points
|
||||
sigma[kkk][jjj] = Double.parseDouble(vals[7+dataManager.numAnnotations+jjj]) * backOffGaussianFactor; //BUGBUG: *3, suggestion by Nick to prevent GMM from over fitting and producing low likelihoods for most points
|
||||
}
|
||||
kkk++;
|
||||
}
|
||||
|
|
@ -156,7 +159,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
}
|
||||
|
||||
// This block of code is used to cluster with novels + 1.5x knowns mixed together
|
||||
/*
|
||||
|
||||
VariantDatum[] data;
|
||||
|
||||
// Grab a set of data that is all of the novel variants plus 1.5x as many known variants drawn at random
|
||||
|
|
@ -183,12 +186,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
|
||||
System.out.println("Clustering with " + numNovel + " novel variants and " + (data.length - numNovel) + " known variants...");
|
||||
if( data.length == dataManager.numVariants ) { System.out.println(" (used all variants since 2.5*numNovel is so large compared to the full set) "); }
|
||||
createClusters( data ); // Using a subset of the data
|
||||
createClusters( data, 0, numGaussians ); // Using a subset of the data
|
||||
System.out.println("Outputting cluster parameters...");
|
||||
printClusters( clusterFileName );
|
||||
//System.out.println("Applying clusters to all variants...");
|
||||
//applyClusters( dataManager.data, outputPrefix ); // Using all the data
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -243,6 +244,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
|
||||
*/
|
||||
|
||||
/*
|
||||
// This block of code is to cluster het and hom calls separately, but mixing together knowns and novels
|
||||
final VariantDatum[] dataHet = new VariantDatum[Math.min(numHet,MAX_VARS)];
|
||||
final VariantDatum[] dataHom = new VariantDatum[Math.min(numHom,MAX_VARS)];
|
||||
|
|
@ -288,11 +290,11 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
createClusters( dataHom, numGaussians / 2, numGaussians );
|
||||
System.out.println("Outputting cluster parameters...");
|
||||
printClusters( clusterFileName );
|
||||
|
||||
*/
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
public final void createClusters( final VariantDatum[] data, int startCluster, int stopCluster ) {
|
||||
|
||||
final int numVariants = data.length;
|
||||
|
|
@ -403,19 +405,19 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
|
||||
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
|
||||
clusterTITV[kkk] = probTi[kkk] / probTv[kkk];
|
||||
if( probKnown[kkk] > 2000.0 ) { // BUGBUG: make this a command line argument, parameterize performance based on this important argument
|
||||
clusterTruePositiveRate[kkk] = calcTruePositiveRateFromKnownTITV( probKnownTi[kkk] / probKnownTv[kkk], probNovelTi[kkk] / probNovelTv[kkk] );
|
||||
if( probKnown[kkk] > 600.0 ) { // BUGBUG: make this a command line argument, parameterize performance based on this important argument
|
||||
clusterTruePositiveRate[kkk] = calcTruePositiveRateFromKnownTITV( probKnownTi[kkk] / probKnownTv[kkk], probNovelTi[kkk] / probNovelTv[kkk], clusterTITV[kkk], knownAlphaFactor );
|
||||
} else {
|
||||
clusterTruePositiveRate[kkk] = calcTruePositiveRateFromTITV( clusterTITV[kkk] );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
|
||||
// This cluster method doesn't make use of the differences between known and novel Ti/Tv ratios
|
||||
|
||||
/*
|
||||
|
||||
public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster ) {
|
||||
|
||||
final int numVariants = data.length;
|
||||
|
|
@ -507,7 +509,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
clusterTruePositiveRate[kkk] = calcTruePositiveRateFromTITV( clusterTITV[kkk] );
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
private void printClusters( final String clusterFileName ) {
|
||||
|
|
@ -568,9 +570,9 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
|
||||
double sum = 0.0;
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
if( isHetCluster[kkk] == isHet ) {
|
||||
//if( isHetCluster[kkk] == isHet ) {
|
||||
sum += pVarInCluster[kkk] * clusterTruePositiveRate[kkk];
|
||||
}
|
||||
//}
|
||||
}
|
||||
|
||||
return sum;
|
||||
|
|
@ -654,7 +656,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
/ sigma[kkk][jjj];
|
||||
}
|
||||
pVarInCluster[kkk][iii] = pCluster[kkk] * Math.exp( -0.5 * sum );
|
||||
|
||||
|
||||
if( pVarInCluster[kkk][iii] < MIN_PROB) { // Very small numbers are a very big problem
|
||||
pVarInCluster[kkk][iii] = MIN_PROB;
|
||||
}
|
||||
|
|
@ -677,30 +679,30 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
|
||||
double sumProb = 0.0;
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
if( isHetCluster[kkk] == isHet ) {
|
||||
//if( isHetCluster[kkk] == isHet ) {
|
||||
double sum = 0.0;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
sum += ( (annotations[jjj] - mu[kkk][jjj]) * (annotations[jjj] - mu[kkk][jjj]) )
|
||||
/ sigma[kkk][jjj];
|
||||
}
|
||||
|
||||
//BUGBUG: pCluster[kkk] should be removed here?
|
||||
pVarInCluster[kkk] = pCluster[kkk] * Math.exp( -0.5 * sum );
|
||||
//pVarInCluster[kkk] = Math.exp( -0.5 * sum );
|
||||
// BUGBUG: reverting to old version that didn't have pCluster[kkk]* here, this meant that the overfitting parameters changed meanings
|
||||
//pVarInCluster[kkk] = pCluster[kkk] * Math.exp( -0.5 * sum );
|
||||
pVarInCluster[kkk] = Math.exp( -0.5 * sum );
|
||||
|
||||
if( pVarInCluster[kkk] < MIN_PROB) { // Very small numbers are a very big problem
|
||||
pVarInCluster[kkk] = MIN_PROB;
|
||||
}
|
||||
|
||||
sumProb += pVarInCluster[kkk];
|
||||
}
|
||||
//}
|
||||
}
|
||||
|
||||
if( sumProb > MIN_SUM_PROB ) { // Very small numbers are a very big problem
|
||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||
if( isHetCluster[kkk] == isHet ) {
|
||||
//if( isHetCluster[kkk] == isHet ) {
|
||||
pVarInCluster[kkk] /= sumProb;
|
||||
}
|
||||
//}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -828,4 +830,4 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -44,7 +44,8 @@ public abstract class VariantOptimizationModel implements VariantOptimizationInt
|
|||
targetTITV = _targetTITV;
|
||||
}
|
||||
|
||||
public final double calcTruePositiveRateFromTITV( double titv ) {
|
||||
public final double calcTruePositiveRateFromTITV( final double _titv ) {
|
||||
double titv = _titv;
|
||||
if( titv > targetTITV ) { titv -= 2.0f*(titv-targetTITV); }
|
||||
if( titv < 0.5 ) { titv = 0.5; }
|
||||
return ( (titv - 0.5) / (targetTITV - 0.5) );
|
||||
|
|
@ -52,9 +53,14 @@ public abstract class VariantOptimizationModel implements VariantOptimizationInt
|
|||
//return ( titv / targetTITV );
|
||||
}
|
||||
|
||||
public final double calcTruePositiveRateFromKnownTITV( final double knownTITV, double novelTITV ) {
|
||||
public final double calcTruePositiveRateFromKnownTITV( final double knownTITV, final double _novelTITV, final double overallTITV, final double knownAlphaFactor ) {
|
||||
|
||||
final double tprTarget = calcTruePositiveRateFromTITV( overallTITV );
|
||||
double novelTITV = _novelTITV;
|
||||
if( novelTITV > knownTITV ) { novelTITV -= 2.0f*(novelTITV-knownTITV); }
|
||||
if( novelTITV < 0.5 ) { novelTITV = 0.5; }
|
||||
return ( (novelTITV - 0.5) / (knownTITV - 0.5) );
|
||||
final double tprKnown = ( (novelTITV - 0.5) / (knownTITV - 0.5) );
|
||||
|
||||
return ( knownAlphaFactor * tprKnown ) + ( (1.0 - knownAlphaFactor) * tprTarget );
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -55,6 +55,8 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
|||
/////////////////////////////
|
||||
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.1 for whole genome experiments)", required=true)
|
||||
private double TARGET_TITV = 2.1;
|
||||
@Argument(fullName="known_alpha_factor", shortName="kFactor", doc="Percentage of true positive rate that is due to difference between known and novel titv in a cluster as opposed to difference from target titv", required=false)
|
||||
private double KNOWN_ALPHA_FACTOR = 0.0;
|
||||
@Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||
private boolean IGNORE_ALL_INPUT_FILTERS = false;
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||
|
|
@ -66,11 +68,11 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
|||
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||
private String CLUSTER_FILENAME = "optimizer.cluster";
|
||||
@Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false)
|
||||
private int NUM_GAUSSIANS = 100;
|
||||
private int NUM_GAUSSIANS = 32;
|
||||
@Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false)
|
||||
private int NUM_ITERATIONS = 7;
|
||||
private int NUM_ITERATIONS = 10;
|
||||
@Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. It can be used to prevent overfitting.", required=false)
|
||||
private int MIN_VAR_IN_CLUSTER = 0;
|
||||
private int MIN_VAR_IN_CLUSTER = 2000;
|
||||
|
||||
//@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
|
||||
//private int NUM_KNN = 2000;
|
||||
|
|
@ -160,6 +162,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
|||
variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
|
||||
variantDatum.isKnown = !vc.getAttribute("ID").equals(".");
|
||||
variantDatum.isHet = vc.getHetCount() > vc.getHomVarCount(); // BUGBUG: what to do here for multi sample calls?
|
||||
|
||||
mapList.add( variantDatum );
|
||||
}
|
||||
}
|
||||
|
|
@ -197,7 +200,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
|||
VariantOptimizationModel theModel;
|
||||
switch (OPTIMIZATION_MODEL) {
|
||||
case GAUSSIAN_MIXTURE_MODEL:
|
||||
theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER );
|
||||
theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER, KNOWN_ALPHA_FACTOR );
|
||||
break;
|
||||
//case K_NEAREST_NEIGHBORS:
|
||||
// theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );
|
||||
|
|
|
|||
Loading…
Reference in New Issue