diff --git a/R/plot_OptimizationCurve.R b/R/plot_OptimizationCurve.R index 23705b79c..2618b6abd 100755 --- a/R/plot_OptimizationCurve.R +++ b/R/plot_OptimizationCurve.R @@ -11,13 +11,14 @@ data = read.table(input,sep=",",head=T) maxVars = max(data$numKnown, data$numNovel) maxTITV = max(data$knownTITV[is.finite(data$knownTITV) & data$numKnown>2000], data$novelTITV[is.finite(data$novelTITV) & data$numNovel > 2000], targetTITV) minTITV = min(data$knownTITV[length(data$knownTITV)], data$novelTITV[length(data$novelTITV)], targetTITV) +maxPCut = max(data$pCut[data$numKnown>0 | data$numNovel>0]) outfile = paste(input, ".optimizationCurve.pdf", sep="") pdf(outfile, height=7, width=8) par(mar=c(4,4,1,4),cex=1.3) -plot(data$pCut, data$knownTITV, axes=F,xlab="Keep variants with p(true) > X",ylab="",ylim=c(minTITV,maxTITV),col="Blue",pch=20) +plot(data$pCut, data$knownTITV, axes=F,xlab="Keep variants with QUAL >= X",ylab="",ylim=c(minTITV,maxTITV),xlim=c(0,maxPCut),col="Blue",pch=20) points(data$pCut, data$novelTITV,,col="DarkBlue",pch=20) abline(h=targetTITV,lty=3,col="Blue") axis(side=2,col="DarkBlue") @@ -25,7 +26,7 @@ axis(side=1) mtext("Ti/Tv Ratio", side=2, line=2, col="blue",cex=1.4) legend("left", c("Known Ti/Tv","Novel Ti/Tv"), col=c("Blue","DarkBlue"), pch=c(20,20),cex=0.7) par(new=T) -plot(data$pCut, data$numKnown, axes=F,xlab="",ylab="",ylim=c(0,maxVars),col="Green",pch=20) +plot(data$pCut, data$numKnown, axes=F,xlab="",ylab="",ylim=c(0,maxVars),xlim=c(0,maxPCut),col="Green",pch=20) points(data$pCut, data$numNovel,col="DarkGreen",pch=20) axis(side=4,col="DarkGreen") mtext("Number of Variants", side=4, line=2, col="DarkGreen",cex=1.4) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java index 6b474a923..46e1aeb9c 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java @@ -3,10 +3,19 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.gatk.refdata.RodVCF; import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.ExpandingArrayList; -import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.genotype.vcf.*; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; +import java.util.TreeSet; /* * Copyright (c) 2010 The Broad Institute @@ -47,8 +56,8 @@ public class ApplyVariantClustersWalker extends RodWalker annotationKeys = new ExpandingArrayList(); + private VariantGaussianMixtureModel theModel = null; + private VCFWriter vcfWriter; //--------------------------------------------------------------------------------------------------------------- // @@ -78,7 +88,6 @@ public class ApplyVariantClustersWalker extends RodWalker hInfo = new HashSet(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "The original variant quality score")); + hInfo.add(new VCFHeaderLine("source", "VariantOptimizer")); + vcfWriter = new VCFWriter( new File(OUTPUT_PREFIX + ".vcf") ); + TreeSet samples = new TreeSet(); + SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap, String>()); + VCFHeader vcfHeader = new VCFHeader(hInfo, samples); + vcfWriter.writeHeader(vcfHeader); } //--------------------------------------------------------------------------------------------------------------- @@ -99,6 +120,35 @@ public class ApplyVariantClustersWalker extends RodWalker map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { final ExpandingArrayList mapList = new ExpandingArrayList(); + + if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers + return mapList; + } + + + for( ReferenceOrderedDatum rod : tracker.getAllRods() ) { + if( rod != null && rod instanceof RodVCF ) { + final RodVCF rodVCF = ((RodVCF) rod); + if( rodVCF.isSNP() && (IGNORE_INPUT_FILTERS || !rodVCF.isFiltered()) ) { + final double pTrue = theModel.evaluateVariant( rodVCF.getInfoValues(), rodVCF.getQual() ); + final double recalQual = QualityUtils.phredScaleErrorRate( Math.max( 1.0 - pTrue, 0.000000001) ); + rodVCF.mCurrentRecord.addInfoField("OQ", ((Double)rodVCF.getQual()).toString() ); + rodVCF.mCurrentRecord.setQual( recalQual ); + vcfWriter.addRecord( rodVCF.mCurrentRecord ); + + VariantDatum variantDatum = new VariantDatum(); + variantDatum.isTransition = BaseUtils.isTransition((byte)rodVCF.getAlternativeBaseForSNP(), (byte)rodVCF.getReferenceForSNP()); //vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0; + variantDatum.isKnown = !rodVCF.isNovel(); //!vc.getAttribute("ID").equals("."); + variantDatum.qual = recalQual; + mapList.add( variantDatum ); + + } else { // not a SNP or is filtered so just dump it out to the VCF file + vcfWriter.addRecord( rodVCF.mCurrentRecord ); + } + } + + } + return mapList; } @@ -119,7 +169,22 @@ public class ApplyVariantClustersWalker extends RodWalker reduceSum ) { - final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys ); + final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys ); + reduceSum.clear(); // Don't need this ever again, clean up some memory + + theModel.outputOptimizationCurve( dataManager.data, OUTPUT_PREFIX, DESIRED_NUM_VARIANTS ); + + // Execute Rscript command to plot the optimization curve + // Print out the command line to make it clear to the user what is being executed and how one might modify it + final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV; + System.out.println( rScriptCommandLine ); + + // Execute the RScript command to plot the table of truth values + try { + Runtime.getRuntime().exec( rScriptCommandLine ); + } catch ( IOException e ) { + throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine ); + } } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java index 8c0516bfe..07e6ba164 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDataManager.java @@ -4,8 +4,6 @@ import org.broadinstitute.sting.utils.ExpandingArrayList; import org.broadinstitute.sting.utils.StingException; import java.io.PrintStream; -import java.util.ArrayList; -import java.util.regex.Pattern; /* * Copyright (c) 2010 The Broad Institute @@ -43,9 +41,9 @@ public class VariantDataManager { public final int numVariants; public final int numAnnotations; public final double[] meanVector; - public final double[] varianceVector; + public final double[] varianceVector; // This is really the standard deviation public boolean isNormalized; - private final ExpandingArrayList annotationKeys; + public final ExpandingArrayList annotationKeys; public VariantDataManager( final ExpandingArrayList dataList, final ExpandingArrayList _annotationKeys ) { numVariants = dataList.size(); @@ -53,7 +51,7 @@ public class VariantDataManager { if( numVariants <= 0 ) { throw new StingException( "There are zero variants! (or possibly a problem with integer overflow)" ); } - numAnnotations = data[0].annotations.length; + numAnnotations = _annotationKeys.size() + 1; // +1 for QUAL if( numAnnotations <= 0 ) { throw new StingException( "There are zero annotations! (or possibly a problem with integer overflow)" ); } @@ -91,7 +89,7 @@ public class VariantDataManager { throw new StingException("Zero variance is a problem: standard deviation = " + theSTD); } meanVector[jjj] = theMean; - varianceVector[jjj] = theSTD * theSTD; + varianceVector[jjj] = theSTD; for( int iii = 0; iii < numVariants; iii++ ) { data[iii].annotations[jjj] = ( data[iii].annotations[jjj] - theMean ) / theSTD; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDatum.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDatum.java index c232c08c2..647ddea52 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDatum.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantDatum.java @@ -35,5 +35,5 @@ public class VariantDatum { public double[] annotations; public boolean isTransition; public boolean isKnown; - public double pTrue; + public double qual; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java index 0d7fe8494..329cffe87 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java @@ -7,7 +7,7 @@ import org.broadinstitute.sting.utils.xReadLines; import java.io.File; import java.io.FileNotFoundException; import java.io.PrintStream; -import java.util.ArrayList; +import java.util.Map; import java.util.Random; import java.util.regex.Pattern; @@ -44,7 +44,7 @@ import java.util.regex.Pattern; public final class VariantGaussianMixtureModel extends VariantOptimizationModel implements VariantClusteringModel { - private final VariantDataManager dataManager; + public final VariantDataManager dataManager; private final int numGaussians; private final int numIterations; private final long RANDOM_SEED = 91801305; @@ -59,9 +59,9 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel private final int[] numMaxClusterNovel; private final double[] clusterTITV; private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio - //private final int desiredNumVariants; private final int minVarInCluster; + private static final double INFINITE_ANNOTATION_VALUE = 10000.0; private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*"); private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*"); @@ -78,7 +78,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel numMaxClusterNovel = new int[numGaussians]; clusterTITV = new double[numGaussians]; clusterTruePositiveRate = new double[numGaussians]; - //desiredNumVariants = _desiredNumVariants; minVarInCluster = _minVarInCluster; } @@ -107,14 +106,25 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel numMaxClusterKnown = null; numMaxClusterNovel = null; clusterTITV = null; - clusterTruePositiveRate = null; - //desiredNumVariants = _desiredNumVariants; minVarInCluster = 0; numGaussians = clusterLines.size(); - mu = new double[numGaussians][]; - sigma = new double[numGaussians][]; + mu = new double[numGaussians][dataManager.numAnnotations]; + sigma = new double[numGaussians][dataManager.numAnnotations]; + clusterTruePositiveRate = new double[numGaussians]; + int kkk = 0; + for( String line : clusterLines ) { + String[] vals = line.split(","); + clusterTruePositiveRate[kkk] = Double.parseDouble(vals[4]); + for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) { + mu[kkk][jjj] = Double.parseDouble(vals[5+jjj]); + sigma[kkk][jjj] = Double.parseDouble(vals[5+dataManager.numAnnotations+jjj]); + } + kkk++; + } + + System.out.println("Found " + numGaussians + " clusters and using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys); } public final void run( final String clusterFileName ) { @@ -278,27 +288,45 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel } } - /* - public final void applyClusters( final VariantDatum[] data, final String outputPrefix ) { + public final double evaluateVariant( final Map annotationMap, final double qualityScore ) { + final double[] pVarInCluster = new double[numGaussians]; + final double[] annotations = new double[dataManager.numAnnotations]; + + for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) { + double value = 0.0; + if( dataManager.annotationKeys.get(jjj).equals("QUAL") ) { + value = qualityScore; + } else { + try { + final String stringValue = annotationMap.get( dataManager.annotationKeys.get(jjj) ); + if( stringValue != null ) { + value = Double.parseDouble( stringValue ); + if( Double.isInfinite(value) ) { + value = ( value > 0 ? 1.0 : -1.0 ) * INFINITE_ANNOTATION_VALUE; + } + } + } catch( NumberFormatException e ) { + // do nothing, default value is 0.0 + } + } + + annotations[jjj] = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj]; + } + + evaluateGaussiansForSingleVariant( annotations, pVarInCluster ); + + double sum = 0; + for( int kkk = 0; kkk < numGaussians; kkk++ ) { + sum += pVarInCluster[kkk] * clusterTruePositiveRate[kkk]; + } + + return sum; + } + + public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, final int desiredNumVariants ) { final int numVariants = data.length; - - final double[] pTrueVariant = new double[numVariants]; final boolean[] markedVariant = new boolean[numVariants]; - final double[] pVarInCluster = new double[numGaussians]; - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Evaluate each variant using the probability of being in each cluster and that cluster's true positive rate - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////// - for( int iii = 0; iii < numVariants; iii++ ) { - evaluateGaussiansForSingleVariant( data[iii], pVarInCluster ); - - pTrueVariant[iii] = 0.0; - markedVariant[iii] = false; - for( int kkk = 0; kkk < numGaussians; kkk++ ) { - pTrueVariant[iii] += pVarInCluster[kkk] * clusterTruePositiveRate[kkk]; - } - } PrintStream outputFile = null; try { @@ -316,10 +344,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel int numNovelTv = 0; boolean foundDesiredNumVariants = false; outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV"); - for( double pCut = 1.0; pCut >= 0.0; pCut -= 0.001 ) { + for( double pCut = 100.0; pCut >= 0.0; pCut -= 0.1 ) { for( int iii = 0; iii < numVariants; iii++ ) { if( !markedVariant[iii] ) { - if( pTrueVariant[iii] >= pCut ) { + if( data[iii].qual >= pCut ) { markedVariant[iii] = true; if( data[iii].isKnown ) { // known numKnown++; @@ -340,7 +368,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel } } if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) { - System.out.println( "Keeping variants with p(true) >= " + String.format("%.3f",pCut) + " results in a filtered set with: " ); + System.out.println( "Keeping variants with p(true) >= " + String.format("%.1f",pCut) + " results in a filtered set with: " ); System.out.println("\t" + numKnown + " known variants"); System.out.println("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)"); System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv))); @@ -353,7 +381,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel ( numNovelTi ==0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) ))); } } - */ private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster ) { @@ -386,18 +413,20 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel } - private void evaluateGaussiansForSingleVariant( final VariantDatum datum, final double[] pVarInCluster ) { + private void evaluateGaussiansForSingleVariant( final double[] annotations, final double[] pVarInCluster ) { - final int numAnnotations = datum.annotations.length; + final int numAnnotations = annotations.length; double sumProb = 0.0; for( int kkk = 0; kkk < numGaussians; kkk++ ) { double sum = 0.0; for( int jjj = 0; jjj < numAnnotations; jjj++ ) { - sum += ( (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]) ) + sum += ( (annotations[jjj] - mu[kkk][jjj]) * (annotations[jjj] - mu[kkk][jjj]) ) / sigma[kkk][jjj]; } - pVarInCluster[kkk] = pCluster[kkk] * Math.exp( -0.5 * sum ); + //BUGBUG: removed pCluster[kkk]*, for the second pass + //pVarInCluster[kkk] = pCluster[kkk] * Math.exp( -0.5 * sum ); + pVarInCluster[kkk] = Math.exp( -0.5 * sum ); if( pVarInCluster[kkk] < MIN_PROB) { // Very small numbers are a very big problem pVarInCluster[kkk] = MIN_PROB; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java index 9ec266d8f..c3ba6bdb4 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java @@ -53,8 +53,6 @@ public class VariantOptimizer extends RodWalker ///////////////////////////// @Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true) private double TARGET_TITV = 2.1; - //@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false) - //private int DESIRED_NUM_VARIANTS = 0; @Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false) private boolean IGNORE_INPUT_FILTERS = false; @Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false) @@ -74,10 +72,6 @@ public class VariantOptimizer extends RodWalker //private int NUM_KNN = 2000; //@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false) private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL; - //@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false) - //private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript"; - //@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false) - //private String PATH_TO_RESOURCES = "R/"; ///////////////////////////// // Private Member Variables @@ -115,7 +109,7 @@ public class VariantOptimizer extends RodWalker for( final VariantContext vc : tracker.getAllVariantContexts(null, context.getLocation(), false, false) ) { - if( vc != null && vc.isSNP() && (IGNORE_INPUT_FILTERS || !vc.isFiltered()) ) { + if( vc != null && vc.isSNP() && (IGNORE_INPUT_FILTERS || !vc.isFiltered()) ) { if( firstVariant ) { // This is the first variant encountered so set up the list of annotations annotationKeys.addAll( vc.getAttributes().keySet() ); if( annotationKeys.contains("ID") ) { annotationKeys.remove("ID"); } // ID field is added to the vc's INFO field? @@ -203,21 +197,6 @@ public class VariantOptimizer extends RodWalker } theModel.run( CLUSTER_FILENAME ); - - - // Functionality moved to different walker - - // Execute Rscript command to plot the optimization curve - // Print out the command line to make it clear to the user what is being executed and how one might modify it - //final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV; - //System.out.println( rScriptCommandLine ); - - // Execute the RScript command to plot the table of truth values - //try { - // Runtime.getRuntime().exec( rScriptCommandLine ); - //} catch ( IOException e ) { - // throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine ); - //} } } diff --git a/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 6dd6ebdbb..1a728a8a9 100755 --- a/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -56,7 +56,7 @@ public class QualityUtils { } /** - * Convert a probability to a quality score. Note, this is capped at Q40. + * Convert a probability to a quality score. Note, this is capped at a quality score which is determined by _eps_. * * @param prob a probability (0.0-1.0) * @param eps min probabilty allowed (0.0-1.0)