From 642c969896906610897e167727daf5643ffaca58 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Fri, 2 Apr 2010 16:59:13 +0000 Subject: [PATCH] reverting optimizer changes git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3112 348d0f76-0448-11de-a6fe-93d51630548a --- .../ApplyVariantClustersWalker.java | 12 ++-- .../VariantConcordanceROCCurveWalker.java | 40 ++++++++---- .../VariantGaussianMixtureModel.java | 62 ++++++++++--------- .../VariantOptimizationModel.java | 12 +++- .../variantoptimizer/VariantOptimizer.java | 11 ++-- 5 files changed, 84 insertions(+), 53 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java index 1d9537b84..8cb6d7f49 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java @@ -53,16 +53,18 @@ public class ApplyVariantClustersWalker extends RodWalker 0.1 ) { variantDatum.qual = 0.5 * recalQual + 0.5 * KNOWN_VAR_QUAL_PRIOR; } else { variantDatum.qual = recalQual; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantConcordanceROCCurveWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantConcordanceROCCurveWalker.java index d5fc80e8e..525a5fa30 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantConcordanceROCCurveWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantConcordanceROCCurveWalker.java @@ -70,6 +70,7 @@ public class VariantConcordanceROCCurveWalker extends RodWalker 1 ) { + multiSampleCalls = true; + System.out.println("Found multi sample calls."); + } else { + sampleName = samples[0]; + System.out.println("Found single sample calls."); + } } } } @@ -119,17 +129,21 @@ public class VariantConcordanceROCCurveWalker extends RodWalker(inputName, variantDatum) ); - } else { // Either not in the call set at all, is a monomorphic call, or was filtered out + } + else if ( vc != null && vc.isFiltered() && vc.getFilters().contains("HARD_TO_VALIDATE") ) { // hard to validate sites are hard to validate so don't count them + + } + else { // Either not in the call set at all, is a monomorphic call, or was filtered out if( isTrueVariant ) { // ... but it is a true variant so this is a false negative call falseNegGlobal[curveIndex]++; } else { // ... and it is not a variant site so this is a true negative call diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java index 3ef776c45..67b147589 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java @@ -55,18 +55,19 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel private final double[][] mu; // The means for the clusters private final double[][] sigma; // The variances for the clusters, sigma is really sigma^2 private final double[] pCluster; - private final boolean[] isHetCluster; + //private final boolean[] isHetCluster; private final int[] numMaxClusterKnown; private final int[] numMaxClusterNovel; private final double[] clusterTITV; private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio private final int minVarInCluster; + private final double knownAlphaFactor; private static final double INFINITE_ANNOTATION_VALUE = 10000.0; private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*"); private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*"); - public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations, final int _minVarInCluster ) { + public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations, final int _minVarInCluster, final double _knownAlphaFactor ) { super( _targetTITV ); dataManager = _dataManager; numGaussians = ( _numGaussians % 2 == 0 ? _numGaussians : _numGaussians + 1 ); @@ -75,15 +76,16 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel mu = new double[numGaussians][]; sigma = new double[numGaussians][]; pCluster = new double[numGaussians]; - isHetCluster = null; + //isHetCluster = null; numMaxClusterKnown = new int[numGaussians]; numMaxClusterNovel = new int[numGaussians]; clusterTITV = new double[numGaussians]; clusterTruePositiveRate = new double[numGaussians]; minVarInCluster = _minVarInCluster; + knownAlphaFactor = _knownAlphaFactor; } - public VariantGaussianMixtureModel( final String clusterFileName ) { + public VariantGaussianMixtureModel( final String clusterFileName, final double backOffGaussianFactor ) { super( 0.0 ); final ExpandingArrayList annotationLines = new ExpandingArrayList(); final ExpandingArrayList clusterLines = new ExpandingArrayList(); @@ -108,24 +110,25 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel numMaxClusterNovel = null; clusterTITV = null; minVarInCluster = 0; + knownAlphaFactor = 0.0; //BUGBUG: move this parsing out of the constructor numGaussians = clusterLines.size(); mu = new double[numGaussians][dataManager.numAnnotations]; sigma = new double[numGaussians][dataManager.numAnnotations]; pCluster = new double[numGaussians]; - isHetCluster = new boolean[numGaussians]; + //isHetCluster = new boolean[numGaussians]; clusterTruePositiveRate = new double[numGaussians]; int kkk = 0; for( String line : clusterLines ) { final String[] vals = line.split(","); - isHetCluster[kkk] = Integer.parseInt(vals[1]) == 1; + //isHetCluster[kkk] = Integer.parseInt(vals[1]) == 1; pCluster[kkk] = Double.parseDouble(vals[2]); clusterTruePositiveRate[kkk] = Double.parseDouble(vals[6]); //BUGBUG: #define these magic index numbers, very easy to make a mistake here for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) { mu[kkk][jjj] = Double.parseDouble(vals[7+jjj]); - sigma[kkk][jjj] = Double.parseDouble(vals[7+dataManager.numAnnotations+jjj]) * 3; //BUGBUG: *1.3, suggestion by Nick to prevent GMM from over fitting and producing low likelihoods for most points + sigma[kkk][jjj] = Double.parseDouble(vals[7+dataManager.numAnnotations+jjj]) * backOffGaussianFactor; //BUGBUG: *3, suggestion by Nick to prevent GMM from over fitting and producing low likelihoods for most points } kkk++; } @@ -156,7 +159,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel } // This block of code is used to cluster with novels + 1.5x knowns mixed together - /* + VariantDatum[] data; // Grab a set of data that is all of the novel variants plus 1.5x as many known variants drawn at random @@ -183,12 +186,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel System.out.println("Clustering with " + numNovel + " novel variants and " + (data.length - numNovel) + " known variants..."); if( data.length == dataManager.numVariants ) { System.out.println(" (used all variants since 2.5*numNovel is so large compared to the full set) "); } - createClusters( data ); // Using a subset of the data + createClusters( data, 0, numGaussians ); // Using a subset of the data System.out.println("Outputting cluster parameters..."); printClusters( clusterFileName ); - //System.out.println("Applying clusters to all variants..."); - //applyClusters( dataManager.data, outputPrefix ); // Using all the data - */ + @@ -243,6 +244,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel */ + /* // This block of code is to cluster het and hom calls separately, but mixing together knowns and novels final VariantDatum[] dataHet = new VariantDatum[Math.min(numHet,MAX_VARS)]; final VariantDatum[] dataHom = new VariantDatum[Math.min(numHom,MAX_VARS)]; @@ -288,11 +290,11 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel createClusters( dataHom, numGaussians / 2, numGaussians ); System.out.println("Outputting cluster parameters..."); printClusters( clusterFileName ); - +*/ } - +/* public final void createClusters( final VariantDatum[] data, int startCluster, int stopCluster ) { final int numVariants = data.length; @@ -403,19 +405,19 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel for( int kkk = startCluster; kkk < stopCluster; kkk++ ) { clusterTITV[kkk] = probTi[kkk] / probTv[kkk]; - if( probKnown[kkk] > 2000.0 ) { // BUGBUG: make this a command line argument, parameterize performance based on this important argument - clusterTruePositiveRate[kkk] = calcTruePositiveRateFromKnownTITV( probKnownTi[kkk] / probKnownTv[kkk], probNovelTi[kkk] / probNovelTv[kkk] ); + if( probKnown[kkk] > 600.0 ) { // BUGBUG: make this a command line argument, parameterize performance based on this important argument + clusterTruePositiveRate[kkk] = calcTruePositiveRateFromKnownTITV( probKnownTi[kkk] / probKnownTv[kkk], probNovelTi[kkk] / probNovelTv[kkk], clusterTITV[kkk], knownAlphaFactor ); } else { clusterTruePositiveRate[kkk] = calcTruePositiveRateFromTITV( clusterTITV[kkk] ); } } } - +*/ // This cluster method doesn't make use of the differences between known and novel Ti/Tv ratios - /* + public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster ) { final int numVariants = data.length; @@ -507,7 +509,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel clusterTruePositiveRate[kkk] = calcTruePositiveRateFromTITV( clusterTITV[kkk] ); } } - */ + private void printClusters( final String clusterFileName ) { @@ -568,9 +570,9 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel double sum = 0.0; for( int kkk = 0; kkk < numGaussians; kkk++ ) { - if( isHetCluster[kkk] == isHet ) { + //if( isHetCluster[kkk] == isHet ) { sum += pVarInCluster[kkk] * clusterTruePositiveRate[kkk]; - } + //} } return sum; @@ -654,7 +656,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel / sigma[kkk][jjj]; } pVarInCluster[kkk][iii] = pCluster[kkk] * Math.exp( -0.5 * sum ); - + if( pVarInCluster[kkk][iii] < MIN_PROB) { // Very small numbers are a very big problem pVarInCluster[kkk][iii] = MIN_PROB; } @@ -677,30 +679,30 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel double sumProb = 0.0; for( int kkk = 0; kkk < numGaussians; kkk++ ) { - if( isHetCluster[kkk] == isHet ) { + //if( isHetCluster[kkk] == isHet ) { double sum = 0.0; for( int jjj = 0; jjj < numAnnotations; jjj++ ) { sum += ( (annotations[jjj] - mu[kkk][jjj]) * (annotations[jjj] - mu[kkk][jjj]) ) / sigma[kkk][jjj]; } - //BUGBUG: pCluster[kkk] should be removed here? - pVarInCluster[kkk] = pCluster[kkk] * Math.exp( -0.5 * sum ); - //pVarInCluster[kkk] = Math.exp( -0.5 * sum ); + // BUGBUG: reverting to old version that didn't have pCluster[kkk]* here, this meant that the overfitting parameters changed meanings + //pVarInCluster[kkk] = pCluster[kkk] * Math.exp( -0.5 * sum ); + pVarInCluster[kkk] = Math.exp( -0.5 * sum ); if( pVarInCluster[kkk] < MIN_PROB) { // Very small numbers are a very big problem pVarInCluster[kkk] = MIN_PROB; } sumProb += pVarInCluster[kkk]; - } + //} } if( sumProb > MIN_SUM_PROB ) { // Very small numbers are a very big problem for( int kkk = 0; kkk < numGaussians; kkk++ ) { - if( isHetCluster[kkk] == isHet ) { + //if( isHetCluster[kkk] == isHet ) { pVarInCluster[kkk] /= sumProb; - } + //} } } } @@ -828,4 +830,4 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel } -} +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java index 71b0027d5..fcf92c97c 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java @@ -44,7 +44,8 @@ public abstract class VariantOptimizationModel implements VariantOptimizationInt targetTITV = _targetTITV; } - public final double calcTruePositiveRateFromTITV( double titv ) { + public final double calcTruePositiveRateFromTITV( final double _titv ) { + double titv = _titv; if( titv > targetTITV ) { titv -= 2.0f*(titv-targetTITV); } if( titv < 0.5 ) { titv = 0.5; } return ( (titv - 0.5) / (targetTITV - 0.5) ); @@ -52,9 +53,14 @@ public abstract class VariantOptimizationModel implements VariantOptimizationInt //return ( titv / targetTITV ); } - public final double calcTruePositiveRateFromKnownTITV( final double knownTITV, double novelTITV ) { + public final double calcTruePositiveRateFromKnownTITV( final double knownTITV, final double _novelTITV, final double overallTITV, final double knownAlphaFactor ) { + + final double tprTarget = calcTruePositiveRateFromTITV( overallTITV ); + double novelTITV = _novelTITV; if( novelTITV > knownTITV ) { novelTITV -= 2.0f*(novelTITV-knownTITV); } if( novelTITV < 0.5 ) { novelTITV = 0.5; } - return ( (novelTITV - 0.5) / (knownTITV - 0.5) ); + final double tprKnown = ( (novelTITV - 0.5) / (knownTITV - 0.5) ); + + return ( knownAlphaFactor * tprKnown ) + ( (1.0 - knownAlphaFactor) * tprTarget ); } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java index 45feecba9..34b0aaa5a 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java @@ -55,6 +55,8 @@ public class VariantOptimizer extends RodWalker ///////////////////////////// @Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.1 for whole genome experiments)", required=true) private double TARGET_TITV = 2.1; + @Argument(fullName="known_alpha_factor", shortName="kFactor", doc="Percentage of true positive rate that is due to difference between known and novel titv in a cluster as opposed to difference from target titv", required=false) + private double KNOWN_ALPHA_FACTOR = 0.0; @Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false) private boolean IGNORE_ALL_INPUT_FILTERS = false; @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false) @@ -66,11 +68,11 @@ public class VariantOptimizer extends RodWalker @Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true) private String CLUSTER_FILENAME = "optimizer.cluster"; @Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false) - private int NUM_GAUSSIANS = 100; + private int NUM_GAUSSIANS = 32; @Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false) - private int NUM_ITERATIONS = 7; + private int NUM_ITERATIONS = 10; @Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. It can be used to prevent overfitting.", required=false) - private int MIN_VAR_IN_CLUSTER = 0; + private int MIN_VAR_IN_CLUSTER = 2000; //@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false) //private int NUM_KNN = 2000; @@ -160,6 +162,7 @@ public class VariantOptimizer extends RodWalker variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0; variantDatum.isKnown = !vc.getAttribute("ID").equals("."); variantDatum.isHet = vc.getHetCount() > vc.getHomVarCount(); // BUGBUG: what to do here for multi sample calls? + mapList.add( variantDatum ); } } @@ -197,7 +200,7 @@ public class VariantOptimizer extends RodWalker VariantOptimizationModel theModel; switch (OPTIMIZATION_MODEL) { case GAUSSIAN_MIXTURE_MODEL: - theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER ); + theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER, KNOWN_ALPHA_FACTOR ); break; //case K_NEAREST_NEIGHBORS: // theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN );