From 33a954989629cb17eeb85be9538490224772a679 Mon Sep 17 00:00:00 2001
From: rpoplin <rpoplin@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Sun, 9 May 2010 16:48:07 +0000
Subject: [PATCH] Variant Optimizer accepts a dbSNP rod arugment to use in
 determining known/novel status as opposed to using the rsID in the vcf
 record. VO generates plots of annotation values used in clustering broken out
 by knowns and novels. Useful for showing which annotations are approximately
 Gaussian.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3332 348d0f76-0448-11de-a6fe-93d51630548a
---
 R/plot_ClusterReport.R                        | 19 +++++++
 .../ApplyVariantClustersWalker.java           | 36 +++++++++---
 .../VariantConcordanceROCCurveWalker.java     |  2 +
 .../VariantGaussianMixtureModel.java          | 57 ++++++++++++++++++-
 .../variantoptimizer/VariantOptimizer.java    | 56 ++++++++++++++++--
 5 files changed, 156 insertions(+), 14 deletions(-)
 create mode 100755 R/plot_ClusterReport.R
diff --git a/R/plot_ClusterReport.R b/R/plot_ClusterReport.R
new file mode 100755
index 000000000..6a23b7808
--- /dev/null
+++ b/R/plot_ClusterReport.R
@@ -0,0 +1,19 @@
+#!/broad/tools/apps/R-2.6.0/bin/Rscript
+
+args <- commandArgs(TRUE)
+verbose = TRUE
+
+input = args[1]
+annotationName = args[2]
+
+data = read.table(input,sep=",",head=T)
+
+outfile = paste(input, ".ClusterReport.pdf", sep="")
+pdf(outfile, height=7, width=8)
+
+maxP = max(data$knownDist, data$novelDist)
+
+plot(data$annotationValue, data$knownDist, ylim=c(0,maxP),type="b",col="orange",lwd=2,xlab=annotationName,ylab="fraction of SNPs")
+points(data$annotationValue, data$novelDist, type="b",col="blue",lwd=2)
+legend('topright', c('knowns','novels'),lwd=2,col=c("orange","blue"))
+dev.off()
\ No newline at end of file
diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java
index 399eaecb5..469a84a0e 100755
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/ApplyVariantClustersWalker.java
@@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrde
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
 import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
+import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
 import org.broadinstitute.sting.gatk.walkers.RodWalker;
 import org.broadinstitute.sting.utils.*;
 import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
@@ -89,6 +90,7 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
     private VCFWriter vcfWriter;
     private Set<String> ignoreInputFilterSet = null;
     private final ArrayList<String> ALLOWED_FORMAT_FIELDS = new ArrayList<String>();
+    private boolean usingDBSNP = false;
 
 
     //---------------------------------------------------------------------------------------------------------------
@@ -127,18 +129,25 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
         hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
         vcfWriter = new VCFWriter( new File(OUTPUT_PREFIX + ".vcf") );
         final TreeSet<String> samples = new TreeSet<String>();
-        List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
-        for ( ReferenceOrderedDataSource source : dataSources ) {
-            RMDTrack rod = source.getReferenceOrderedData();
+        final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
+        for ( final ReferenceOrderedDataSource source : dataSources ) {
+            final RMDTrack rod = source.getReferenceOrderedData();
             if ( rod.getType().equals(VCFCodec.class) ) {
-                VCFReader reader = new VCFReader(rod.getFile());
-                Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
+                final VCFReader reader = new VCFReader(rod.getFile());
+                final Set<String> vcfSamples = reader.getHeader().getGenotypeSamples();
                 samples.addAll(vcfSamples);
                 reader.close();
             }
         }
         final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
         vcfWriter.writeHeader(vcfHeader);
+
+        for( final ReferenceOrderedDataSource source : dataSources ) {
+            final RMDTrack rod = source.getReferenceOrderedData();
+            if ( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
+                usingDBSNP = true;
+            }
+        }
     }
 
     //---------------------------------------------------------------------------------------------------------------
@@ -155,13 +164,22 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
         }
 
         for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
-            final VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), ALLOWED_FORMAT_FIELDS, false, false);
-            if( vc != null && vc.isSNP() ) {
+            if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
+                final VCFRecord vcf = VariantContextAdaptors.toVCF(vc, ref.getBase(), ALLOWED_FORMAT_FIELDS, false, false);
                 if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
                     final VariantDatum variantDatum = new VariantDatum();
                     variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
-                    variantDatum.isKnown = !vc.getAttribute("ID").equals(".");
-
+                    boolean isKnown = !vc.getAttribute("ID").equals(".");
+                    if(usingDBSNP) {
+                        isKnown = false;
+                        for( VariantContext dbsnpVC : tracker.getVariantContexts(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, null, context.getLocation(), false, false) ) {
+                            if(dbsnpVC != null && dbsnpVC.isSNP()) {
+                                isKnown=true;
+                            }
+                        }
+                    }
+                    variantDatum.isKnown = isKnown;
+                    
                     final double pTrue = theModel.evaluateVariant( vc.getAttributes(), vc.getPhredScaledQual() );
                     double recalQual = 400.0 * QualityUtils.phredScaleErrorRate( Math.max(1.0 - pTrue, 0.000000001) );
                     
diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantConcordanceROCCurveWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantConcordanceROCCurveWalker.java
index 27ae29674..07cbfeae1 100755
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantConcordanceROCCurveWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantConcordanceROCCurveWalker.java
@@ -107,6 +107,8 @@ public class VariantConcordanceROCCurveWalker extends RodWalker<ExpandingArrayLi
             trueNegGlobal[kkk] = 0;
             falseNegGlobal[kkk] = 0;
         }
+
+        
     }
 
     //---------------------------------------------------------------------------------------------------------------
diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java
index 0a1d301ec..90ccfd5f5 100755
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantGaussianMixtureModel.java
@@ -506,6 +506,61 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
         */
     }
 
+   public final void outputClusterReports( final String outputPrefix ) {
+        final double STD_STEP = 0.2;
+        final double MAX_STD = 4.0;
+        final double MIN_STD = -4.0;
+        final int NUM_BINS = (int)Math.floor((Math.abs(MIN_STD) + Math.abs(MAX_STD)) / STD_STEP);
+        final int numAnnotations = dataManager.numAnnotations;
+        int totalCountsKnown = 0;
+        int totalCountsNovel = 0;
+
+        final int counts[][][] = new int[numAnnotations][NUM_BINS][2];
+        for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
+            for( int iii = 0; iii < NUM_BINS; iii++ ) {
+                counts[jjj][iii][0] = 0;
+                counts[jjj][iii][1] = 0;
+            }
+        }
+
+        for( VariantDatum datum : dataManager.data ) {
+            final int isKnown = ( datum.isKnown ? 1 : 0 );
+            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
+                int histBin = (int)Math.round((datum.annotations[jjj]-MIN_STD) * (1.0 / STD_STEP));
+                if(histBin < 0) { histBin = 0; }
+                if(histBin > NUM_BINS-1) { histBin = NUM_BINS-1; }
+                if(histBin >= 0 && histBin <= NUM_BINS-1) {
+                    counts[jjj][histBin][isKnown]++;
+                }
+            }
+            if( isKnown == 1 ) { totalCountsKnown++; }
+            else { totalCountsNovel++; }
+        }
+
+        int annIndex = 0;
+        for( final String annotation : dataManager.annotationKeys ) {
+            PrintStream outputFile;
+            try {
+                outputFile = new PrintStream( outputPrefix + "." + annotation + ".dat" );
+            } catch (Exception e) {
+                throw new StingException( "Unable to create output file: " + outputPrefix + ".dat" );
+            }
+
+            outputFile.println("annotationValue,knownDist,novelDist");
+
+            for( int iii = 0; iii < NUM_BINS; iii++ ) {
+                final double annotationValue = (((double)iii * STD_STEP)+MIN_STD) * dataManager.varianceVector[annIndex] + dataManager.meanVector[annIndex];
+                outputFile.println( annotationValue + "," + ( ((double)counts[annIndex][iii][1])/((double)totalCountsKnown) ) +
+                                                      "," + ( ((double)counts[annIndex][iii][0])/((double)totalCountsNovel) ));
+            }
+
+            annIndex++;
+        }
+
+       // BUGBUG: next output the actual cluster on top by integrating out every other annotation
+    }
+
+
     public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, final int desiredNumVariants ) {
 
         final int numVariants = data.length;
@@ -719,7 +774,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
             }
 
             final double denom = Math.pow(2.0 * 3.14159, ((double)numAnnotations) / 2.0) * Math.pow(determinant[kkk], 0.5);
-            pVarInCluster[kkk] =  (1.0 / ((double) numGaussians)) * (Math.exp( -0.5 * sum )) / denom;
+            pVarInCluster[kkk] =  pCluster[kkk] * (Math.exp( -0.5 * sum )) / denom;
 
             /*
             if( isUsingTiTvModel ) {
diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java
index 637fe7daf..0106917d7 100755
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/VariantOptimizer.java
@@ -28,14 +28,19 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
+import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
+import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
 import org.broadinstitute.sting.gatk.walkers.RodWalker;
 import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
 import org.broadinstitute.sting.utils.StingException;
 import org.broadinstitute.sting.commandline.Argument;
 
+import java.io.IOException;
 import java.util.Arrays;
+import java.util.List;
 import java.util.Set;
 import java.util.TreeSet;
 
@@ -66,11 +71,16 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
     @Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
     private String CLUSTER_FILENAME = "optimizer.cluster";
     @Argument(fullName="numGaussians", shortName="nG", doc="The number of Gaussians to be used in the Gaussian Mixture model", required=false)
-    private int NUM_GAUSSIANS = 26;
+    private int NUM_GAUSSIANS = 7;
     @Argument(fullName="numIterations", shortName="nI", doc="The number of iterations to be performed in the Gaussian Mixture model", required=false)
     private int NUM_ITERATIONS = 10;
     @Argument(fullName="minVarInCluster", shortName="minVar", doc="The minimum number of variants in a cluster to be considered a valid cluster. It can be used to prevent overfitting.", required=false)
     private int MIN_VAR_IN_CLUSTER = 1000;
+    @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
+    private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
+    @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
+    private String PATH_TO_RESOURCES = "R/";
+    
 
     //@Argument(fullName="knn", shortName="knn", doc="The number of nearest neighbors to be used in the k-Nearest Neighbors model", required=false)
     //private int NUM_KNN = 2000;
@@ -85,6 +95,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
     private int numAnnotations = 0;
     private static final double INFINITE_ANNOTATION_VALUE = 10000.0;
     private Set<String> ignoreInputFilterSet = null;
+    private boolean usingDBSNP = false;
 
     //---------------------------------------------------------------------------------------------------------------
     //
@@ -96,6 +107,13 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
         if( IGNORE_INPUT_FILTERS != null ) {
             ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
         }
+        final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
+        for( final ReferenceOrderedDataSource source : dataSources ) {
+            final RMDTrack rod = source.getReferenceOrderedData();
+            if ( rod.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
+                usingDBSNP = true;
+            }
+        }
     }
 
     //---------------------------------------------------------------------------------------------------------------
@@ -116,7 +134,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
 
         for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
 
-            if( vc != null && vc.isSNP() ) {
+            if( vc != null && !vc.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) && vc.isSNP() ) {
                 if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
                     if( firstVariant ) { // This is the first variant encountered so set up the list of annotations
                         annotationKeys.addAll( vc.getAttributes().keySet() );
@@ -162,7 +180,16 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
                     final VariantDatum variantDatum = new VariantDatum();
                     variantDatum.annotations = annotationValues;
                     variantDatum.isTransition = vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
-                    variantDatum.isKnown = !vc.getAttribute("ID").equals(".");
+                    boolean isKnown = !vc.getAttribute("ID").equals(".");
+                    if(usingDBSNP) {
+                        isKnown = false;
+                        for( VariantContext dbsnpVC : tracker.getVariantContexts(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, null, context.getLocation(), false, false) ) {
+                            if(dbsnpVC != null && dbsnpVC.isSNP()) {
+                                isKnown=true;
+                            }
+                        }
+                    }
+                    variantDatum.isKnown = isKnown;
 
                     mapList.add( variantDatum );
                 }
@@ -198,7 +225,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
         dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]
         
         // Create either the Gaussian Mixture Model or the Nearest Neighbors model and run it
-        VariantOptimizationModel theModel;
+        VariantGaussianMixtureModel theModel;
         switch (OPTIMIZATION_MODEL) {
             case GAUSSIAN_MIXTURE_MODEL:
                 theModel = new VariantGaussianMixtureModel( dataManager, TARGET_TITV, NUM_GAUSSIANS, NUM_ITERATIONS, MIN_VAR_IN_CLUSTER );
@@ -211,6 +238,27 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
         }
         
         theModel.run( CLUSTER_FILENAME );
+
+        theModel.outputClusterReports( CLUSTER_FILENAME );
+
+
+        for( final String annotation : annotationKeys ) {
+            // Execute Rscript command to plot the optimization curve
+            // Print out the command line to make it clear to the user what is being executed and how one might modify it
+            final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_ClusterReport.R" + " " + CLUSTER_FILENAME + "." + annotation + ".dat " + annotation;
+            System.out.println( rScriptCommandLine );
+
+            // Execute the RScript command to plot the table of truth values
+            try {
+                final Process p = Runtime.getRuntime().exec( rScriptCommandLine );
+                p.waitFor();
+            } catch (InterruptedException e) {
+                e.printStackTrace();
+                System.exit(-1);
+            } catch ( IOException e ) {
+                throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
+            }
+        }
     }
 
 }