From 33a954989629cb17eeb85be9538490224772a679 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Sun, 9 May 2010 16:48:07 +0000 Subject: [PATCH] Variant Optimizer accepts a dbSNP rod arugment to use in determining known/novel status as opposed to using the rsID in the vcf record. VO generates plots of annotation values used in clustering broken out by knowns and novels. Useful for showing which annotations are approximately Gaussian. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3332 348d0f76-0448-11de-a6fe-93d51630548a --- R/plot_ClusterReport.R | 19 +++++++ .../ApplyVariantClustersWalker.java | 36 +++++++++--- .../VariantConcordanceROCCurveWalker.java | 2 + .../VariantGaussianMixtureModel.java | 57 ++++++++++++++++++- .../variantoptimizer/VariantOptimizer.java | 56 ++++++++++++++++-- 5 files changed, 156 insertions(+), 14 deletions(-) create mode 100755 R/plot_ClusterReport.R diff --git a/R/plot_ClusterReport.R b/R/plot_ClusterReport.R new file mode 100755 index 000000000..6a23b7808 --- /dev/null +++ b/R/plot_ClusterReport.R @@ -0,0 +1,19 @@ +#!/broad/tools/apps/R-2.6.0/bin/Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +input = args[1] +annotationName = args[2] + +data = read.table(input,sep=",",head=T) + +outfile = paste(input, ".ClusterReport.pdf", sep="") +pdf(outfile, height=7, width=8) + +maxP = max(data$knownDist, data$novelDist) + +plot(data$annotationValue, data$knownDist, ylim=c(0,maxP),type="b",col="orange",lwd=2,xlab=annotationName,ylab="fraction of SNPs") +points(data$annotationValue, data$novelDist, type="b",col="blue",lwd=2) +legend('topright', c('knowns','novels'),lwd=2,col=c("orange","blue")) +dev.off() \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java index 399eaecb5..469a84a0e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrde import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; +import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; @@ -89,6 +90,7 @@ public class ApplyVariantClustersWalker extends RodWalker ignoreInputFilterSet = null; private final ArrayList ALLOWED_FORMAT_FIELDS = new ArrayList(); + private boolean usingDBSNP = false; //--------------------------------------------------------------------------------------------------------------- @@ -127,18 +129,25 @@ public class ApplyVariantClustersWalker extends RodWalker samples = new TreeSet(); - List dataSources = this.getToolkit().getRodDataSources(); - for ( ReferenceOrderedDataSource source : dataSources ) { - RMDTrack rod = source.getReferenceOrderedData(); + final List dataSources = this.getToolkit().getRodDataSources(); + for ( final ReferenceOrderedDataSource source : dataSources ) { + final RMDTrack rod = source.getReferenceOrderedData(); if ( rod.getType().equals(VCFCodec.class) ) { - VCFReader reader = new VCFReader(rod.getFile()); - Set vcfSamples = reader.getHeader().getGenotypeSamples(); + final VCFReader reader = new VCFReader(rod.getFile()); + final Set vcfSamples = reader.getHeader().getGenotypeSamples(); samples.addAll(vcfSamples); reader.close(); } } final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); vcfWriter.writeHeader(vcfHeader); + + for( final ReferenceOrderedDataSource source : dataSources ) { + final RMDTrack rod = source.getReferenceOrderedData(); + if ( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { + usingDBSNP = true; + } + } } //--------------------------------------------------------------------------------------------------------------- @@ -155,13 +164,22 @@ public class ApplyVariantClustersWalker extends RodWalker NUM_BINS-1) { histBin = NUM_BINS-1; } + if(histBin >= 0 && histBin <= NUM_BINS-1) { + counts[jjj][histBin][isKnown]++; + } + } + if( isKnown == 1 ) { totalCountsKnown++; } + else { totalCountsNovel++; } + } + + int annIndex = 0; + for( final String annotation : dataManager.annotationKeys ) { + PrintStream outputFile; + try { + outputFile = new PrintStream( outputPrefix + "." + annotation + ".dat" ); + } catch (Exception e) { + throw new StingException( "Unable to create output file: " + outputPrefix + ".dat" ); + } + + outputFile.println("annotationValue,knownDist,novelDist"); + + for( int iii = 0; iii < NUM_BINS; iii++ ) { + final double annotationValue = (((double)iii * STD_STEP)+MIN_STD) * dataManager.varianceVector[annIndex] + dataManager.meanVector[annIndex]; + outputFile.println( annotationValue + "," + ( ((double)counts[annIndex][iii][1])/((double)totalCountsKnown) ) + + "," + ( ((double)counts[annIndex][iii][0])/((double)totalCountsNovel) )); + } + + annIndex++; + } + + // BUGBUG: next output the actual cluster on top by integrating out every other annotation + } + + public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, final int desiredNumVariants ) { final int numVariants = data.length; @@ -719,7 +774,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel } final double denom = Math.pow(2.0 * 3.14159, ((double)numAnnotations) / 2.0) * Math.pow(determinant[kkk], 0.5); - pVarInCluster[kkk] = (1.0 / ((double) numGaussians)) * (Math.exp( -0.5 * sum )) / denom; + pVarInCluster[kkk] = pCluster[kkk] * (Math.exp( -0.5 * sum )) / denom; /* if( isUsingTiTvModel ) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java index 637fe7daf..0106917d7 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java @@ -28,14 +28,19 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; +import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.commandline.Argument; +import java.io.IOException; import java.util.Arrays; +import java.util.List; import java.util.Set; import java.util.TreeSet; @@ -66,11 +71,16 @@ public class VariantOptimizer extends RodWalker @Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true) private String CLUSTER_FILENAME = "optimizer.cluster"; @Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false) - private int NUM_GAUSSIANS = 26; + private int NUM_GAUSSIANS = 7; @Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false) private int NUM_ITERATIONS = 10; @Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. It can be used to prevent overfitting.", required=false) private int MIN_VAR_IN_CLUSTER = 1000; + @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false) + private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript"; + @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false) + private String PATH_TO_RESOURCES = "R/"; + //@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false) //private int NUM_KNN = 2000; @@ -85,6 +95,7 @@ public class VariantOptimizer extends RodWalker private int numAnnotations = 0; private static final double INFINITE_ANNOTATION_VALUE = 10000.0; private Set ignoreInputFilterSet = null; + private boolean usingDBSNP = false; //--------------------------------------------------------------------------------------------------------------- // @@ -96,6 +107,13 @@ public class VariantOptimizer extends RodWalker if( IGNORE_INPUT_FILTERS != null ) { ignoreInputFilterSet = new TreeSet(Arrays.asList(IGNORE_INPUT_FILTERS)); } + final List dataSources = this.getToolkit().getRodDataSources(); + for( final ReferenceOrderedDataSource source : dataSources ) { + final RMDTrack rod = source.getReferenceOrderedData(); + if ( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { + usingDBSNP = true; + } + } } //--------------------------------------------------------------------------------------------------------------- @@ -116,7 +134,7 @@ public class VariantOptimizer extends RodWalker for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) { - if( vc != null && vc.isSNP() ) { + if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) { if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) { if( firstVariant ) { // This is the first variant encountered so set up the list of annotations annotationKeys.addAll( vc.getAttributes().keySet() ); @@ -162,7 +180,16 @@ public class VariantOptimizer extends RodWalker final VariantDatum variantDatum = new VariantDatum(); variantDatum.annotations = annotationValues; variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0; - variantDatum.isKnown = !vc.getAttribute("ID").equals("."); + boolean isKnown = !vc.getAttribute("ID").equals("."); + if(usingDBSNP) { + isKnown = false; + for( VariantContext dbsnpVC : tracker.getVariantContexts(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, null, context.getLocation(), false, false) ) { + if(dbsnpVC != null && dbsnpVC.isSNP()) { + isKnown=true; + } + } + } + variantDatum.isKnown = isKnown; mapList.add( variantDatum ); } @@ -198,7 +225,7 @@ public class VariantOptimizer extends RodWalker dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ] // Create either the Gaussian Mixture Model or the Nearest Neighbors model and run it - VariantOptimizationModel theModel; + VariantGaussianMixtureModel theModel; switch (OPTIMIZATION_MODEL) { case GAUSSIAN_MIXTURE_MODEL: theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER ); @@ -211,6 +238,27 @@ public class VariantOptimizer extends RodWalker } theModel.run( CLUSTER_FILENAME ); + + theModel.outputClusterReports( CLUSTER_FILENAME ); + + + for( final String annotation : annotationKeys ) { + // Execute Rscript command to plot the optimization curve + // Print out the command line to make it clear to the user what is being executed and how one might modify it + final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_ClusterReport.R" + " " + CLUSTER_FILENAME + "." + annotation + ".dat " + annotation; + System.out.println( rScriptCommandLine ); + + // Execute the RScript command to plot the table of truth values + try { + final Process p = Runtime.getRuntime().exec( rScriptCommandLine ); + p.waitFor(); + } catch (InterruptedException e) { + e.printStackTrace(); + System.exit(-1); + } catch ( IOException e ) { + throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine ); + } + } } }