From 9254faa27e6baede2d1847ce1de33deb0b570414 Mon Sep 17 00:00:00 2001
From: depristo <depristo@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Thu, 16 Jun 2011 12:46:29 +0000
Subject: [PATCH] Added density plots by sample for each metric.  New command
 line argument ordering.  No longer requires the per-sample.tsv suppl. data --
 will conditionally load if available

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6003 348d0f76-0448-11de-a6fe-93d51630548a
---
 R/exomeQC.R | 51 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/R/exomeQC.R b/R/exomeQC.R
index 144326df1..8704b4fef 100644
--- a/R/exomeQC.R
+++ b/R/exomeQC.R
@@ -16,11 +16,13 @@ parseHighlightSamples <- function(s) {
   return(unlist(strsplit(s, ",", fixed=T)))
 }
 
+preQCFile = NA
 if ( onCMDLine ) {
   ProjectName = args[1]
   VariantEvalRoot = args[2]
-  preQCFile = args[3]
-  outputPDF = args[4]
+  outputPDF = args[3]
+  if ( ! is.na(args[4]) ) 
+    preQCFile = args[4]
   if ( ! is.na(args[5]) ) 
     highlightSamples = parseHighlightSamples(args[5])
   else
@@ -33,6 +35,13 @@ if ( onCMDLine ) {
   highlightSamples = c() # parseHighlightSamples("29029,47243")
 }
 
+print("Report")
+print(paste("Project          :", ProjectName))
+print(paste("VariantEvalRoot  :", VariantEvalRoot))
+print(paste("outputPDF        :", outputPDF))
+print(paste("preQCFile        :", preQCFile))
+print(paste("highlightSamples :", highlightSamples))
+
 expandVEReport <- function(d) {
   d$TiTvVariantEvaluator$tiTvRatio = round(d$TiTvVariantEvaluator$tiTvRatio,2) 
   d$CountVariants$deletionInsertionRatio = round(d$CountVariants$deletionInsertionRatio,2) 
@@ -72,7 +81,8 @@ summaryPlots <- function(metricsBySites) {
   p <- ggplot(data=molten, aes(x=AC, y=value+1, color=Novelty, fill=Novelty), group=variable)
   p <- p + opts(title = name)
   p <- p + scale_y_log10("Number of variants")
-  p <- p + geom_area()
+  p <- p + geom_point(alpha=0.5, size=3)
+  p <- p + geom_line(size=1)
   p <- p + facet_grid(variable ~ ., scales="free")
   p <- p + scale_x_continuous("Allele count (AC)")
   print(p)
@@ -134,9 +144,11 @@ addSection <- function(name) {
  
 createMetricsBySamples <- function(VariantEvalRoot) {
   byAFEval <- expandVEReport(gsa.read.gatkreport(paste(VariantEvalRoot, ".bySample.eval", sep="")))
-  preQCMetrics <- read.table(preQCFile, header=T) 
-  r = merge(merge(byAFEval$TiTvVariantEvaluator, byAFEval$CountVariants), preQCMetrics)
-
+  r = merge(byAFEval$TiTvVariantEvaluator, byAFEval$CountVariants)
+  if ( ! is.na(preQCFile) ) {
+    preQCMetrics <- read.table(preQCFile, header=T) 
+    r = merge(merge(byAFEval$TiTvVariantEvaluator, byAFEval$CountVariants), preQCMetrics)
+  }
   # order the samples by nSNPs -- it's the natural ordering.
   x = subset(r, Novelty=="all")
   r$Sample <- factor(x$Sample, levels=x$Sample[order(x$nSNPs)])
@@ -148,12 +160,6 @@ createMetricsBySamples <- function(VariantEvalRoot) {
   return(subset(r, Sample != "all"))
 }
 
-# actually load the data.
-if ( onCMDLine || LOAD_DATA ) {
-  metricsBySites <- createMetricsBySites(VariantEvalRoot)
-  metricsBySamples <- createMetricsBySamples(VariantEvalRoot)
-}
-
 # -------------------------------------------------------
 # Per sample plots
 # -------------------------------------------------------
@@ -167,12 +173,21 @@ perSamplePlots <- function(metricsBySamples) {
   #myRug <- geom_rug(aes(x = NULL))
 
   measures = c("nSNPs", "tiTvRatio", "nSingletons", "nIndels", "nInsertions", "nDeletions", "deletionInsertionRatio")
-  # Counts vs. Allele frequency 
-  name = "Variant counts by allele count"
+  name = "by sample"
   for ( measure in measures ) {
     molten = melt(metricsBySamples, id.vars=c("Novelty", "Sample", "highlightTextSizes"), measure.vars=c(measure))
+
+    # distribution
+    p <- ggplot(data=molten, aes(x=value, group=Novelty, fill=Novelty))
+    p <- p + opts(title = paste(measure, name))
+    p <- p + geom_density(alpha=0.5)
+    p <- p + geom_rug(aes(y=NULL, color=Novelty, position="jitter"))
+    p <- p + scale_x_continuous(measure)
+    # how do we remove the labels?
+    print(p)
+
     p <- ggplot(data=molten, aes(x=Sample, y=value, group=Novelty, color=Novelty), y=value)
-    p <- p + opts(title = paste(name, ":", measure))
+    p <- p + opts(title = paste(measure, name))
     p <- p + geom_smooth(alpha=0.5, aes(group=Novelty))
     p <- p + sampleTextLabel + sampleTextLabelScale
     p <- p + myRug
@@ -203,6 +218,12 @@ perSamplePlots <- function(metricsBySamples) {
 # Actually invoke the above plotting functions 
 # -------------------------------------------------------
 
+# load the data.
+if ( onCMDLine || LOAD_DATA ) {
+  metricsBySites <- createMetricsBySites(VariantEvalRoot)
+  metricsBySamples <- createMetricsBySamples(VariantEvalRoot)
+}
+
 if ( ! is.na(outputPDF) ) {
   pdf(outputPDF, height=8.5, width=11)
 }