Added a new option for indicating the mean number of variants on the AnalyzeAnnotations plots. This way one can say, for example, filtering at this point will keep 75 percent of all the variants.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2744 348d0f76-0448-11de-a6fe-93d51630548a
2010-01-29 21:58:31 +00:00 · 2010-01-29 21:58:31 +00:00 · 16da5011c0
parent 668c7da33d
commit 16da5011c0
3 changed files with 40 additions and 14 deletions
--- a/R/plot_Annotations_BinnedTruthMetrics.R
+++ b/R/plot_Annotations_BinnedTruthMetrics.R
@ -6,6 +6,7 @@ verbose = TRUE
 input = args[1]
 annotationName = args[2]
 minBinCutoff = as.numeric(args[3])
+medianNumVariants = args[4]

 c <- read.table(input, header=T)

@ -14,11 +15,26 @@ novel = c[c$numVariants>minBinCutoff & c$category=="novel",]
 dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",]
 truth = c[c$numVariants>minBinCutoff & c$category=="truth",]

+#
+# Calculate min, max, medians
+#
+
 d = c[c$numVariants>minBinCutoff,]
 ymin = min(d$titv)
 ymax = max(d$titv)
 xmin = min(d$value)
 xmax = max(d$value)
+m = weighted.mean(all$value,all$numVariants/sum(all$numVariants))
+ma = all[all$value > m,]
+mb = all[all$value < m,]
+m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants))
+m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants))
+if(medianNumVariants == "true") {
+vc = cumsum( all$numVariants/sum(all$numVariants) )
+m25 = all$value[ max(which(vc<=0.25)) ]
+m = all$value[ max(which(vc<=0.5)) ]
+m75 = all$value[ min(which(vc>=0.75)) ]
+}

 #
 # Plot TiTv ratio as a function of the annotation
@ -29,11 +45,6 @@ pdf(outfile, height=7, width=7)
 par(cex=1.1)
 plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
 axis(1,axTicks(1), format(axTicks(1), scientific=F))
-m = weighted.mean(all$value,all$numVariants/sum(all$numVariants))
-ma = all[all$value > m,]
-mb = all[all$value < m,]
-m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants))
-m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants))
 abline(v=m,lty=2)
 abline(v=m75,lty=2)
 abline(v=m25,lty=2)
@ -84,7 +95,6 @@ t = all[all$truePositive>0,]
 yLabel = "DBsnp/True Positive Rate"
 ymin = min(min(all$dbsnp),min(t$truePositive))
 ymax = max(max(all$dbsnp),max(t$truePositive))
-
 }
 plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
 axis(1,axTicks(1), format(axTicks(1), scientific=F))
@ -126,6 +136,17 @@ dev.off()
 outfile = paste(input, ".Histogram.pdf", sep="")
 pdf(outfile, height=7, width=7)
 par(cex=1.1)
-plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14);
+plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4);
+axis(1,axTicks(1), format(axTicks(1), scientific=F))
+dev.off()
+
+#
+# Plot histogram of the annotation's value, log scale on x-axis
+#
+
+outfile = paste(input, ".Histogram_log.pdf", sep="")
+pdf(outfile, height=7, width=7)
+par(cex=1.1)
+plot(all$value,all$numVariants,xlab=annotationName,log="x",ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4);
 axis(1,axTicks(1), format(axTicks(1), scientific=F))
 dev.off()
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnalyzeAnnotationsWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnalyzeAnnotationsWalker.java
@ -65,6 +65,9 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
    private String SAMPLE_NAME = null;
    @Argument(fullName = "name", shortName = "name", doc = "Labels for the annotations to make plots look nicer. Each name is a separate -name argument. For example, -name DP,Depth -name AB,AlleleBalance", required = false)
    private String[] ANNOTATION_NAMES = null;
+    @Argument(fullName = "indicate_mean_num_vars", shortName = "meanNumVars", doc = "If supplied, plots will indicate the distribution of number of variants instead of distribution of value of annotation", required = false)
+    private boolean INDICATE_MEAN_NUM_VARS = false;
+        


    /////////////////////////////
@ -89,7 +92,7 @@ public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
                nameMap.put(vals[0],vals[1]);
            }
        }
-        dataManager = new AnnotationDataManager( nameMap );
+        dataManager = new AnnotationDataManager( nameMap, INDICATE_MEAN_NUM_VARS );

        if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
    }
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/variantoptimizer/AnnotationDataManager.java
@ -42,12 +42,14 @@ import java.io.FileNotFoundException;

 public class AnnotationDataManager {

-    public final HashMap<String, TreeSet<AnnotationDatum>> data;
-    public final HashMap<String,String> nameMap;
+    private final HashMap<String, TreeSet<AnnotationDatum>> data;
+    private final HashMap<String,String> nameMap;
+    private final boolean INDICATE_MEAN_NUM_VARS;

-    public AnnotationDataManager( HashMap<String,String> _nameMap ) {
+    public AnnotationDataManager( HashMap<String,String> _nameMap, boolean _INDICATE_MEAN_NUM_VARS ) {
        data = new HashMap<String, TreeSet<AnnotationDatum>>();
        nameMap = _nameMap;
+        INDICATE_MEAN_NUM_VARS = _INDICATE_MEAN_NUM_VARS;
    }

    public void addAnnotations( final RodVCF variant, final String sampleName, final boolean isInTruthSet, final boolean isTrueVariant ) {
@ -62,7 +64,7 @@ public class AnnotationDataManager {
        final Map<String,String> infoField = variant.getInfoValues();
        for( String annotationKey : infoField.keySet() ) {

-            float value = 0.0f;
+            float value;
            try {
                value = Float.parseFloat( infoField.get( annotationKey ) );
            } catch( NumberFormatException e ) {
@ -140,13 +142,13 @@ public class AnnotationDataManager {
            output.close();
            
            String annotationName = nameMap.get(annotationKey);
-            if( annotationName == null ) { // name is not in the map so use the key
+            if( annotationName == null ) { // This annotation is not in the name map so use the key instead
                annotationName = annotationKey;
            }

            // Print out the command line to make it clear to the user what is being executed and how one might modify it
            final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTruthMetrics.R" + " " +
-                                   OUTPUT_PREFIX + annotationKey + ".dat" + " " + annotationName + " " + MIN_VARIANTS_PER_BIN;
+                                   OUTPUT_PREFIX + annotationKey + ".dat" + " " + annotationName + " " + MIN_VARIANTS_PER_BIN + " " + INDICATE_MEAN_NUM_VARS;
            System.out.println( rScriptCommandLine );

            // Execute the RScript command to plot the table of TiTv values