diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/helpers/DbSNPHelper.java b/java/src/org/broadinstitute/sting/gatk/refdata/utils/helpers/DbSNPHelper.java index e96e9a5b4..cc9374c0b 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/utils/helpers/DbSNPHelper.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/utils/helpers/DbSNPHelper.java @@ -71,7 +71,6 @@ public class DbSNPHelper { return DbSNPHelper.isInsertion(feature) || DbSNPHelper.isDeletion(feature) || feature.getVariantType().contains("in-del"); } - public static boolean isHapmap(DbSNPFeature feature) { return feature.getValidationStatus().contains("by-hapmap"); } @@ -80,6 +79,14 @@ public class DbSNPHelper { return feature.getValidationStatus().contains("by-2hit-2allele"); } + public static boolean is1000genomes(DbSNPFeature feature) { + return feature.getValidationStatus().contains("by-1000genomes"); + } + + public static boolean isMQ1(DbSNPFeature feature) { + return feature.getWeight() == 1; + } + /** * gets the alternate alleles. This method should return all the alleles present at the location, * NOT including the reference base. This is returned as a string list with no guarantee ordering diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java index cf18d54fe..b0e822a2d 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; +import org.broad.tribble.dbsnp.DbSNPFeature; import org.broad.tribble.vcf.*; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -60,7 +61,7 @@ public class ApplyVariantClustersWalker extends RodWalker ignoreInputFilterSet = null; private final ArrayList ALLOWED_FORMAT_FIELDS = new ArrayList(); - private boolean usingDBSNP = false; - //--------------------------------------------------------------------------------------------------------------- // @@ -130,9 +133,9 @@ public class ApplyVariantClustersWalker extends RodWalker samples = new TreeSet(); final List dataSources = this.getToolkit().getRodDataSources(); - for ( final ReferenceOrderedDataSource source : dataSources ) { + for( final ReferenceOrderedDataSource source : dataSources ) { final RMDTrack rod = source.getReferenceOrderedData(); - if ( rod.getType().equals(VCFCodec.class) ) { + if( rod.getType().equals(VCFCodec.class) ) { final VCFReader reader = new VCFReader(rod.getFile()); final Set vcfSamples = reader.getHeader().getGenotypeSamples(); samples.addAll(vcfSamples); @@ -142,12 +145,17 @@ public class ApplyVariantClustersWalker extends RodWalker 0.1 ) { // only use the known prior if the value is specified (meaning not equal to zero) - variantDatum.qual = 0.5 * recalQual + 0.5 * KNOWN_VAR_QUAL_PRIOR; - } else { - variantDatum.qual = recalQual; - } - mapList.add( variantDatum ); + final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)); + variantDatum.isKnown = dbsnp != null; + variantDatum.alleleCount = vc.getChromosomeCount(vc.getAlternateAllele(0)); // BUGBUG: assumes file has genotypes + + final double acPrior = theModel.getAlleleCountPrior( variantDatum.alleleCount ); + final double knownPrior = ( variantDatum.isKnown ? QualityUtils.qualToProb(KNOWN_QUAL_PRIOR) : QualityUtils.qualToProb(NOVEL_QUAL_PRIOR) ); + final double pTrue = theModel.evaluateVariant( vc ) * acPrior * knownPrior; + + variantDatum.qual = QUALITY_SCALE_FACTOR * QualityUtils.phredScaleErrorRate( Math.max(1.0 - pTrue, 0.000000001) ); // BUGBUG: don't have a normalizing constant, so need to scale up qual scores arbitrarily + mapList.add( variantDatum ); vcf.addInfoField("OQ", ((Double)vc.getPhredScaledQual()).toString() ); vcf.setQual( variantDatum.qual ); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDatum.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDatum.java index 647ddea52..9ffcd169e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDatum.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDatum.java @@ -36,4 +36,6 @@ public class VariantDatum { public boolean isTransition; public boolean isKnown; public double qual; + public double weight; + public int alleleCount; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java index ad01d3fc2..cde0fb526 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java @@ -58,22 +58,19 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel private final double MIN_SIGMA = 1E-5; private final double MIN_DETERMINANT = 1E-5; - private final double[][] mu; // The means for the clusters - private final Matrix[] sigma; // The variances for the clusters, sigma is really sigma^2 + private final double[][] mu; // The means for each cluster + private final Matrix[] sigma; // The covariance matrix for each cluster private final Matrix[] sigmaInverse; private final double[] pCluster; private final double[] determinant; - private final double[] clusterTITV; - private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio + private final double[] alleleCountFactorArray; private final int minVarInCluster; - public final boolean isUsingTiTvModel; - private static final double INFINITE_ANNOTATION_VALUE = 6000.0; private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*"); + private static final Pattern ALLELECOUNT_PATTERN = Pattern.compile("^@!ALLELECOUNT.*"); private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*"); - public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final double _targetTITV, final int _numGaussians, final int _numIterations, final int _minVarInCluster ) { - super( _targetTITV ); + public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final int _numGaussians, final int _numIterations, final int _minVarInCluster, final int maxAC ) { dataManager = _dataManager; numGaussians = _numGaussians; numIterations = _numIterations; @@ -82,22 +79,23 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel sigma = new Matrix[numGaussians]; determinant = new double[numGaussians]; pCluster = new double[numGaussians]; - clusterTITV = new double[numGaussians]; - clusterTruePositiveRate = new double[numGaussians]; + alleleCountFactorArray = new double[maxAC + 1]; minVarInCluster = _minVarInCluster; - sigmaInverse = null; - isUsingTiTvModel = false; // this field isn't used during VariantOptimizerWalker + sigmaInverse = null; // This field isn't used during VariantOptimizer pass } public VariantGaussianMixtureModel( final double _targetTITV, final String clusterFileName, final double backOffGaussianFactor ) { super( _targetTITV ); final ExpandingArrayList annotationLines = new ExpandingArrayList(); + final ExpandingArrayList alleleCountLines = new ExpandingArrayList(); final ExpandingArrayList clusterLines = new ExpandingArrayList(); try { - for ( String line : new XReadLines(new File( clusterFileName )) ) { + for ( final String line : new XReadLines(new File( clusterFileName )) ) { if( ANNOTATION_PATTERN.matcher(line).matches() ) { annotationLines.add(line); + } else if( ALLELECOUNT_PATTERN.matcher(line).matches() ) { + alleleCountLines.add(line); } else if( CLUSTER_PATTERN.matcher(line).matches() ) { clusterLines.add(line); } else { @@ -111,7 +109,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel dataManager = new VariantDataManager( annotationLines ); // Several of the clustering parameters aren't used the second time around in ApplyVariantClusters numIterations = 0; - clusterTITV = null; minVarInCluster = 0; // BUGBUG: move this parsing out of the constructor @@ -122,19 +119,21 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel sigmaInverse = new Matrix[numGaussians]; pCluster = new double[numGaussians]; determinant = new double[numGaussians]; - clusterTruePositiveRate = new double[numGaussians]; - boolean _isUsingTiTvModel = false; + + alleleCountFactorArray = new double[alleleCountLines.size() + 1]; + for( final String line : alleleCountLines ) { + final String[] vals = line.split(","); + alleleCountFactorArray[Integer.parseInt(vals[1])] = Double.parseDouble(vals[2]); + } int kkk = 0; - for( String line : clusterLines ) { + for( final String line : clusterLines ) { final String[] vals = line.split(","); - pCluster[kkk] = Double.parseDouble(vals[1]); - clusterTruePositiveRate[kkk] = Double.parseDouble(vals[3]); // BUGBUG: #define these magic index numbers, very easy to make a mistake here - if( clusterTruePositiveRate[kkk] != 1.0 ) { _isUsingTiTvModel = true; } + pCluster[kkk] = Double.parseDouble(vals[1]); // BUGBUG: #define these magic index numbers, very easy to make a mistake here for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) { - mu[kkk][jjj] = Double.parseDouble(vals[4+jjj]); + mu[kkk][jjj] = Double.parseDouble(vals[2+jjj]); for( int ppp = 0; ppp < dataManager.numAnnotations; ppp++ ) { - sigmaVals[kkk][jjj][ppp] = Double.parseDouble(vals[4+dataManager.numAnnotations+(jjj*dataManager.numAnnotations)+ppp]) * backOffGaussianFactor; // BUGBUG: *3, suggestion by Nick to prevent GMM from over fitting and producing low likelihoods for most points + sigmaVals[kkk][jjj][ppp] = Double.parseDouble(vals[2+dataManager.numAnnotations+(jjj*dataManager.numAnnotations)+ppp]) * backOffGaussianFactor; } } @@ -143,93 +142,48 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel determinant[kkk] = sigma[kkk].det(); kkk++; } - isUsingTiTvModel = _isUsingTiTvModel; logger.info("Found " + numGaussians + " clusters and using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys); } public final void run( final String clusterFileName ) { - final int MAX_KNOWN_VARS = 5000000; // BUGBUG: make this a command line argument - final int MAX_NOVEL_VARS = 5000000; // BUGBUG: make this a command line argument - final double knownNovelMixture = 1.5; // BUGBUG: make this a command line argument + // Initialize the Allele Count prior + generateAlleleCountPrior(); - // Create the subset of the data to cluster with - int numNovel = 0; - int numKnown = 0; + // Simply cluster with all the variants. The knowns have been given more weight than the novels + logger.info("Clustering with " + dataManager.data.length + " variants."); + createClusters( dataManager.data, 0, numGaussians, clusterFileName, false ); + } + + private void generateAlleleCountPrior() { + + final double[] acExpectation = new double[alleleCountFactorArray.length]; + final double[] acActual = new double[alleleCountFactorArray.length]; + final int[] alleleCount = new int[alleleCountFactorArray.length]; + + double sumExpectation = 0.0; + for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) { + acExpectation[iii] = 1.0 / ((double) iii); + sumExpectation += acExpectation[iii]; + } + for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) { + acExpectation[iii] /= sumExpectation; // Turn acExpectation into a probability distribution + alleleCount[iii] = 0; + } for( final VariantDatum datum : dataManager.data ) { - if( datum.isKnown ) { - numKnown++; - } else { - numNovel++; - } + alleleCount[datum.alleleCount]++; } - - final int numNovelCluster = Math.min( numNovel, MAX_NOVEL_VARS ); - final int numKnownCluster = Math.min( numKnown, MAX_KNOWN_VARS ); - final int numKnownTogether = Math.min( numKnownCluster, (int) Math.floor(knownNovelMixture * numNovelCluster) ); - - final VariantDatum[] dataTogether = new VariantDatum[numNovelCluster + numKnownTogether]; - final VariantDatum[] dataKnown = new VariantDatum[numKnownCluster]; - - // Create the dataTogether array, which is all the novels and 1.5x as many knowns, downsampled if there are too many - int iii = 0; - if( numNovelCluster == numNovel ) { - for( final VariantDatum datum : dataManager.data ) { - if( !datum.isKnown ) { - dataTogether[iii++] = datum; - } - } - } else { - logger.info("Capped at " + MAX_NOVEL_VARS + " novel variants."); - while( iii < numNovelCluster ) { - final VariantDatum datum = dataManager.data[rand.nextInt(dataManager.numVariants)]; - if( !datum.isKnown ) { - dataTogether[iii++] = datum; - } - } + for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) { + acActual[iii] = ((double)alleleCount[iii]) / ((double)dataManager.data.length); // Turn acActual into a probability distribution } - if( numKnownTogether == numKnown ) { - for( final VariantDatum datum : dataManager.data ) { - if( datum.isKnown ) { - dataTogether[iii++] = datum; - } - } - } else { - while( iii < numNovelCluster + numKnownTogether ) { - final VariantDatum datum = dataManager.data[rand.nextInt(dataManager.numVariants)]; - if( datum.isKnown ) { - dataTogether[iii++] = datum; - } - } + for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) { + alleleCountFactorArray[iii] = acExpectation[iii] / acActual[iii]; // Prior is (expected / observed) } + } - // Create the dataKnown array, which is simply all the known vars or downsampled if there are too many - iii = 0; - if( numKnownCluster == numKnown ) { - for( final VariantDatum datum : dataManager.data ) { - if( datum.isKnown ) { - dataKnown[iii++] = datum; - } - } - } else { - logger.info("Capped at " + MAX_KNOWN_VARS + " known variants."); - while( iii < numKnownCluster ) { - final VariantDatum datum = dataManager.data[rand.nextInt(dataManager.numVariants)]; - if( datum.isKnown ) { - dataKnown[iii++] = datum; - } - } - } - - final boolean useTITV = true; - logger.info("First, cluster with novels and knowns together to use ti/tv based models:"); - logger.info("Clustering with " + numNovelCluster + " novel variants and " + numKnownTogether + " known variants."); - createClusters( dataTogether, 0, numGaussians, clusterFileName, useTITV ); - - logger.info("Finally, cluster with only knowns to use ti/tv-less models:"); - logger.info("Clustering with " + numKnownCluster + " known variants."); - createClusters( dataKnown, 0, numGaussians, clusterFileName, !useTITV ); + public final double getAlleleCountPrior( final int alleleCount ) { + return alleleCountFactorArray[alleleCount]; } public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFileName, final boolean useTITV ) { @@ -277,108 +231,28 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel maximizeGaussians( data, pVarInCluster, startCluster, stopCluster ); ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Estimate each cluster's p(true) and output cluster parameters + // Output cluster parameters at each iteration ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// - outputGaussians( data, pVarInCluster, ttt+1, startCluster, stopCluster, clusterFileName, useTITV ); + printClusterParameters( clusterFileName + "." + (ttt+1) ); logger.info("Finished iteration " + (ttt+1) ); } - } - private void outputGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int iterationNumber, - final int startCluster, final int stopCluster, final String clusterFileName, final boolean useTITV ) { - - if( !useTITV ) { - for( int kkk = startCluster; kkk < stopCluster; kkk++ ) { - clusterTITV[kkk] = 0.0; - clusterTruePositiveRate[kkk] = 1.0; - } - printClusterParameters( clusterFileName + ".WithoutTiTv." + iterationNumber ); - return; - } - - final int numVariants = data.length; - - final double[] probTi = new double[numGaussians]; - final double[] probTv = new double[numGaussians]; - final double[] probKnown = new double[numGaussians]; - final double[] probNovel = new double[numGaussians]; - final double[] probKnownTi = new double[numGaussians]; - final double[] probKnownTv = new double[numGaussians]; - final double[] probNovelTi = new double[numGaussians]; - final double[] probNovelTv = new double[numGaussians]; - - for( int kkk = startCluster; kkk < stopCluster; kkk++ ) { - probTi[kkk] = 0.0; - probTv[kkk] = 0.0; - probKnown[kkk] = 0.0; - probNovel[kkk] = 0.0; - probKnownTi[kkk] = 0.0; - probKnownTv[kkk] = 0.0; - probNovelTi[kkk] = 0.0; - probNovelTv[kkk] = 0.0; - } - - // Use the cluster's probabilistic Ti/Tv ratio as the indication of the cluster's true positive rate - for( int iii = 0; iii < numVariants; iii++ ) { - final boolean isTransition = data[iii].isTransition; - final boolean isKnown = data[iii].isKnown; - for( int kkk = startCluster; kkk < stopCluster; kkk++ ) { - final double prob = pVarInCluster[kkk][iii]; - if( isKnown ) { // known - probKnown[kkk] += prob; - if( isTransition ) { // transition - probKnownTi[kkk] += prob; - probTi[kkk] += prob; - } else { // transversion - probKnownTv[kkk] += prob; - probTv[kkk] += prob; - } - } else { //novel - probNovel[kkk] += prob; - if( isTransition ) { // transition - probNovelTi[kkk] += prob; - probTi[kkk] += prob; - } else { // transversion - probNovelTv[kkk] += prob; - probTv[kkk] += prob; - } - } - } - } - - for( int ttt = 0; ttt < 3; ttt++ ) { - double knownAlphaFactor = 0.0; - if( ttt == 0 ) { - knownAlphaFactor = 0.0; - } else if( ttt == 1 ) { - knownAlphaFactor = 1.0; - } else if( ttt == 2 ) { - knownAlphaFactor = 0.5; - } - for( int kkk = startCluster; kkk < stopCluster; kkk++ ) { - clusterTITV[kkk] = probTi[kkk] / probTv[kkk]; - if( probKnown[kkk] > 500.0 && probNovel[kkk] > 500.0 ) { - clusterTruePositiveRate[kkk] = calcTruePositiveRateFromKnownTITV( probKnownTi[kkk] / probKnownTv[kkk], probNovelTi[kkk] / probNovelTv[kkk], clusterTITV[kkk], knownAlphaFactor ); - } else { - clusterTruePositiveRate[kkk] = calcTruePositiveRateFromTITV( clusterTITV[kkk] ); - } - } - - if( ttt == 0 ) { - printClusterParameters( clusterFileName + ".TargetTiTv." + iterationNumber ); - } else if( ttt == 1 ) { - printClusterParameters( clusterFileName + ".KnownTiTv." + iterationNumber ); - } else if( ttt == 2 ) { - printClusterParameters( clusterFileName + ".BlendedTiTv." + iterationNumber ); - } - } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Output the final cluster parameters + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + printClusterParameters( clusterFileName ); } private void printClusterParameters( final String clusterFileName ) { try { final PrintStream outputFile = new PrintStream( clusterFileName ); dataManager.printClusterFileHeader( outputFile ); + for( int iii = 1; iii < alleleCountFactorArray.length; iii++ ) { + outputFile.print("@!ALLELECOUNT,"); + outputFile.println(iii + "," + alleleCountFactorArray[iii]); + } + final int numAnnotations = mu[0].length; final int numVariants = dataManager.numVariants; for( int kkk = 0; kkk < numGaussians; kkk++ ) { @@ -386,8 +260,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel final double sigmaVals[][] = sigma[kkk].getArray(); outputFile.print("@!CLUSTER,"); outputFile.print(pCluster[kkk] + ","); - outputFile.print(clusterTITV[kkk] + ","); - outputFile.print(clusterTruePositiveRate[kkk] + ","); for(int jjj = 0; jjj < numAnnotations; jjj++ ) { outputFile.print(mu[kkk][jjj] + ","); } @@ -406,19 +278,18 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel } public static double decodeAnnotation( final String annotationKey, final VariantContext vc ) { - double value = 0.0; - if( annotationKey.equals("AB") && !vc.getAttributes().containsKey(annotationKey) ) { - value = (0.5 - 0.005) + (0.01 * rand.nextDouble()); // HomVar calls don't have an allele balance - } else if( annotationKey.equals("QUAL") ) { + double value; + //if( annotationKey.equals("AB") && !vc.getAttributes().containsKey(annotationKey) ) { + // value = (0.5 - 0.005) + (0.01 * rand.nextDouble()); // HomVar calls don't have an allele balance + //} + if( annotationKey.equals("QUAL") ) { value = vc.getPhredScaledQual(); } else { try { - value = Double.parseDouble( (String)vc.getAttribute( annotationKey, "0.0" ) ); - if( Double.isInfinite(value) ) { - value = ( value > 0 ? 1.0 : -1.0 ) * INFINITE_ANNOTATION_VALUE; - } + value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) ); } catch( NumberFormatException e ) { - // do nothing, default value is 0.0 + throw new StingException("No double value detected for annotation = " + annotationKey + + " in variant at " + vc.getLocation() + ", reported annotation value = " + vc.getAttribute( annotationKey ) ); } } return value; @@ -437,7 +308,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel double sum = 0.0; for( int kkk = 0; kkk < numGaussians; kkk++ ) { - sum += pVarInCluster[kkk] * clusterTruePositiveRate[kkk]; + sum += pVarInCluster[kkk]; // * clusterTruePositiveRate[kkk]; } return sum; } @@ -496,7 +367,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel // BUGBUG: next output the actual cluster on top by integrating out every other annotation } - public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, final int desiredNumVariants ) { final int numVariants = data.length; @@ -669,8 +539,8 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel throw new StingException("Numerical Instability! probability distribution returns > 1.0. Try running with fewer clusters and then with better behaved annotation values."); } - if( pVarInCluster[kkk][iii] < MIN_PROB) { // Very small numbers are a very big problem - pVarInCluster[kkk][iii] = MIN_PROB;// + MIN_PROB * rand.nextDouble(); + if( pVarInCluster[kkk][iii] < MIN_PROB ) { // Very small numbers are a very big problem + pVarInCluster[kkk][iii] = MIN_PROB; // + MIN_PROB * rand.nextDouble(); } sumProb += pVarInCluster[kkk][iii]; @@ -678,11 +548,11 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel for( int kkk = startCluster; kkk < stopCluster; kkk++ ) { pVarInCluster[kkk][iii] /= sumProb; + pVarInCluster[kkk][iii] *= data[iii].weight; } - } - logger.info("Explained likelihood = " + String.format("%.5f",likelihood / ((double) data.length))); + logger.info("Explained likelihood = " + String.format("%.5f",likelihood / data.length)); } @@ -825,7 +695,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel } */ - + /* // Replace extremely small clusters with another random draw from the dataset for( int kkk = startCluster; kkk < stopCluster; kkk++ ) { if( pCluster[kkk] < 0.0005 * (1.0 / ((double) (stopCluster-startCluster))) || @@ -855,5 +725,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel for( int kkk = startCluster; kkk < stopCluster; kkk++ ) { pCluster[kkk] /= sumPK; } + */ } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java index fcf92c97c..f1666473f 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizationModel.java @@ -40,6 +40,10 @@ public abstract class VariantOptimizationModel implements VariantOptimizationInt protected final double targetTITV; + public VariantOptimizationModel() { + targetTITV = 0.0; + } + public VariantOptimizationModel( final double _targetTITV ) { targetTITV = _targetTITV; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java index 1ec283a8a..027a7d380 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; +import org.broad.tribble.dbsnp.DbSNPFeature; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; @@ -55,8 +56,6 @@ public class VariantOptimizer extends RodWalker ///////////////////////////// // Command Line Arguments ///////////////////////////// - @Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.1 for whole genome experiments)", required=true) - private double TARGET_TITV = 2.1; @Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false) private boolean IGNORE_ALL_INPUT_FILTERS = false; @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false) @@ -65,17 +64,24 @@ public class VariantOptimizer extends RodWalker private String[] USE_ANNOTATIONS = null; @Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true) private String CLUSTER_FILENAME = "optimizer.cluster"; - @Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false) - private int NUM_GAUSSIANS = 1; - @Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false) + @Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used when clustering", required=false) + private int NUM_GAUSSIANS = 6; + @Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed when clustering", required=false) private int NUM_ITERATIONS = 10; @Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. It can be used to prevent overfitting.", required=false) - private int MIN_VAR_IN_CLUSTER = 1000; + private int MIN_VAR_IN_CLUSTER = 0; @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false) private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript"; @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false) private String PATH_TO_RESOURCES = "R/"; - + @Argument(fullName="weightKnowns", shortName="weightKnowns", doc="The weight for known variants during clustering", required=false) + private double WEIGHT_KNOWNS = 8.0; + @Argument(fullName="weightHapMap", shortName="weightHapMap", doc="The weight for known HapMap variants during clustering", required=false) + private double WEIGHT_HAPMAP = 120.0; + @Argument(fullName="weight1000Genomes", shortName="weight1000Genomes", doc="The weight for known 1000 Genomes Project variants during clustering", required=false) + private double WEIGHT_1000GENOMES = 12.0; + @Argument(fullName="weightMQ1", shortName="weightMQ1", doc="The weight for MQ1 dbSNP variants during clustering", required=false) + private double WEIGHT_MQ1 = 10.0; //@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false) //private int NUM_KNN = 2000; @@ -86,9 +92,8 @@ public class VariantOptimizer extends RodWalker // Private Member Variables ///////////////////////////// private ExpandingArrayList annotationKeys; - private static final double INFINITE_ANNOTATION_VALUE = 10000.0; private Set ignoreInputFilterSet = null; - private boolean usingDBSNP = false; + private int maxAC = 0; //--------------------------------------------------------------------------------------------------------------- // @@ -103,14 +108,18 @@ public class VariantOptimizer extends RodWalker ignoreInputFilterSet = new TreeSet(Arrays.asList(IGNORE_INPUT_FILTERS)); } - // todo -- shouldn't be caching these values -- reduces code flexibility + boolean foundDBSNP = false; final List dataSources = this.getToolkit().getRodDataSources(); for( final ReferenceOrderedDataSource source : dataSources ) { final RMDTrack rod = source.getReferenceOrderedData(); if ( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { - usingDBSNP = true; + foundDBSNP = true; } } + + if(!foundDBSNP) { + throw new StingException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites."); + } } //--------------------------------------------------------------------------------------------------------------- @@ -141,19 +150,23 @@ public class VariantOptimizer extends RodWalker final VariantDatum variantDatum = new VariantDatum(); variantDatum.annotations = annotationValues; variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0; - - // todo this is a bit ugly logic -- why not just look for DBSNP? - boolean isKnown = !vc.getAttribute("ID").equals("."); - if(usingDBSNP) { - isKnown = false; - for( final VariantContext dbsnpVC : tracker.getVariantContexts(ref, DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, null, context.getLocation(), false, false) ) { - if(dbsnpVC != null && dbsnpVC.isSNP()) { - isKnown = true; - } - } + variantDatum.alleleCount = vc.getChromosomeCount(vc.getAlternateAllele(0)); // BUGBUG: assumes file has genotypes + if( variantDatum.alleleCount > maxAC ) { + maxAC = variantDatum.alleleCount; } - variantDatum.isKnown = isKnown; + variantDatum.isKnown = false; + variantDatum.weight = 1.0; + + final DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)); + if( dbsnp != null ) { + variantDatum.isKnown = true; + variantDatum.weight = WEIGHT_KNOWNS; + if( DbSNPHelper.isHapmap( dbsnp ) ) { variantDatum.weight = WEIGHT_HAPMAP; } + else if( DbSNPHelper.is1000genomes( dbsnp ) ) { variantDatum.weight = WEIGHT_1000GENOMES; } + else if( DbSNPHelper.isMQ1( dbsnp ) ) { variantDatum.weight = WEIGHT_MQ1; } + } + mapList.add( variantDatum ); } } @@ -191,7 +204,7 @@ public class VariantOptimizer extends RodWalker VariantGaussianMixtureModel theModel; switch (OPTIMIZATION_MODEL) { case GAUSSIAN_MIXTURE_MODEL: - theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER ); + theModel = new VariantGaussianMixtureModel( dataManager, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER, maxAC ); break; //case K_NEAREST_NEIGHBORS: // theModel = new VariantNearestNeighborsModel( dataManager, TARGET_TITV, NUM_KNN ); @@ -201,10 +214,8 @@ public class VariantOptimizer extends RodWalker } theModel.run( CLUSTER_FILENAME ); - theModel.outputClusterReports( CLUSTER_FILENAME ); - for( final String annotation : annotationKeys ) { // Execute Rscript command to plot the optimization curve // Print out the command line to make it clear to the user what is being executed and how one might modify it