Change to support plotting of indel quality as a function of covariates - for now, just call different R calling script.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5053 348d0f76-0448-11de-a6fe-93d51630548a
2011-01-22 14:09:23 +00:00 · 2011-01-22 14:09:23 +00:00 · a50d7f74fa
parent fa0c476b82
commit a50d7f74fa
2 changed files with 129 additions and 10 deletions
--- a/R/plot_indelQuality.R
+++ b/R/plot_indelQuality.R
@ -0,0 +1,108 @@
+#!/bin/env Rscript
+
+args <- commandArgs(TRUE)
+verbose = TRUE
+
+input = args[1]
+covariateName = args[2]
+
+outfile = paste(input, ".indelQual_v_", covariateName, ".pdf", sep="")
+pdf(outfile, height=7, width=7)
+par(cex=1.1)
+c <- read.table(input, header=T)
+c <- c[sort.list(c[,1]),]
+
+#
+# Plot qual as a function of the covariate
+#
+
+d.good <- c[c$nBases >= 1000,]
+d.1000 <- c[c$nBases < 1000,]
+rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
+rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) )
+theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
+if( length(d.good$nBases) == length(c$nBases) ) {
+	theTitle = paste("RMSE =", round(rmseAll,digits=3))
+}
+# Don't let residual error go off the edge of the plot
+d.good$residualError = d.good$Qempirical#-d.good$Qreported
+#d.good$residualError[which(d.good$residualError > 10)] = 10
+#d.good$residualError[which(d.good$residualError < -10)] = -10
+d.1000$residualError = d.1000$Qempirical#-d.1000$Qreported
+#d.1000$residualError[which(d.1000$residualError > 10)] = 10
+#d.1000$residualError[which(d.1000$residualError < -10)] = -10
+c$residualError = c$Qempirical
+#c$residualError[which(c$residualError > 10)] = 10
+#c$residualError[which(c$residualError < -10)] = -10
+pointType = "p"
+if( length(c$Covariate) <= 20 ) {
+    pointType = "o"
+}
+if( is.numeric(c$Covariate) ) {
+	plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical Indel Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-0, 50), xlim=c(min(c$Covariate),max(c$Covariate)))
+	points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20)
+} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
+	plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical Indel Quality", xlab=covariateName, col="blue", ylim=c(0, 50))
+	points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue")
+}
+dev.off()
+
+
+#
+# Plot mean quality versus the covariate
+#
+
+outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="")
+pdf(outfile, height=7, width=7)
+par(cex=1.1)
+pointType = "p"
+if( length(c$Covariate) <= 20 ) {
+    pointType = "o"
+}
+theTitle = paste("Quality By", covariateName);
+if( is.numeric(c$Covariate) ) {
+	plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate)))
+	points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20)
+} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
+	plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40))
+	points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue")
+}
+dev.off()
+
+#
+# Plot histogram of the covariate
+#
+
+e = d.good
+f = d.1000
+outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
+pdf(outfile, height=7, width=7)
+hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0)
+hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0)
+
+lwdSize=2
+if( length(c$Covariate) <= 20 ) {
+    lwdSize=7
+} else if( length(c$Covariate) <= 70 ) {
+    lwdSize=4
+}
+
+if( is.numeric(c$Covariate) ) {
+    if( length(hst$e.Covariate) == 0 ) {
+        plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
+    } else {
+	    plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
+	    points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue")
+	}
+	axis(2,axTicks(2), format(axTicks(2), scientific=F))
+} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
+	hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
+	plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n")
+	if( length(hst$c.Covariate) > 9 ) {
+	    axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
+	} else {
+	    axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate)
+	}
+	axis(2,axTicks(2), format(axTicks(2), scientific=F))
+}
+dev.off()
--- a/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
+++ b/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
@ -70,6 +70,8 @@ class AnalyzeCovariatesCLP extends CommandLineProgram {
    private int MAX_QUALITY_SCORE = 50;
    @Argument(fullName="max_histogram_value", shortName="maxHist", required = false, doc="If supplied, this value will be the max value of the histogram plots")
    private int MAX_HISTOGRAM_VALUE = 0;
+    @Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, this value will be the max value of the histogram plots")
+    private boolean DO_INDEL_QUALITY = false;


    /////////////////////////////
@ -274,17 +276,26 @@ class AnalyzeCovariatesCLP extends CommandLineProgram {
                for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
                    Covariate cov = requestedCovariates.get(iii);
                    try {
+
+                        if (DO_INDEL_QUALITY) {
+                            p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_indelQuality.R" + " " +
+                                         OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
+                                         cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
+                             p.waitFor();
+                            
+                        }   else {
                        if( iii == 1 ) {
-                            // Analyze reported quality
-                            p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_QualityScoreCovariate.R" + " " +
-                                        OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
-                                        IGNORE_QSCORES_LESS_THAN + " " + MAX_QUALITY_SCORE + " " + MAX_HISTOGRAM_VALUE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored
-                            p.waitFor();
-                        } else { // Analyze all other covariates
-                            p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_OtherCovariate.R" + " " +
-                                        OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
-                                        cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
-                            p.waitFor();
+                                // Analyze reported quality
+                                p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_QualityScoreCovariate.R" + " " +
+                                            OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
+                                            IGNORE_QSCORES_LESS_THAN + " " + MAX_QUALITY_SCORE + " " + MAX_HISTOGRAM_VALUE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored
+                                p.waitFor();
+                            } else { // Analyze all other covariates
+                                p = Runtime.getRuntime().exec(PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_residualError_OtherCovariate.R" + " " +
+                                            OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat" + " " +
+                                            cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice
+                                p.waitFor();
+                            }
                        }
                    } catch (InterruptedException e) {
                        e.printStackTrace();