From f4ae6edb9240ee80debba64693c678d782b23ca1 Mon Sep 17 00:00:00 2001
From: Ryan Poplin <rpoplin@broadinstitute.org>
Date: Thu, 30 Jun 2011 14:55:25 -0400
Subject: [PATCH] Moving some of the released R scripts into public from
 private

---
 .../R/plot_Annotations_BinnedTruthMetrics.R   | 190 ++++++++++++++++++
 public/R/plot_Tranches.R                      |  87 ++++++++
 public/R/plot_residualError_OtherCovariate.R  | 108 ++++++++++
 ...plot_residualError_QualityScoreCovariate.R |  70 +++++++
 public/R/titvFPEst.R                          | 138 +++++++++++++
 5 files changed, 593 insertions(+)
 create mode 100644 public/R/plot_Annotations_BinnedTruthMetrics.R
 create mode 100755 public/R/plot_Tranches.R
 create mode 100644 public/R/plot_residualError_OtherCovariate.R
 create mode 100644 public/R/plot_residualError_QualityScoreCovariate.R
 create mode 100755 public/R/titvFPEst.R

diff --git a/public/R/plot_Annotations_BinnedTruthMetrics.R b/public/R/plot_Annotations_BinnedTruthMetrics.R
new file mode 100644
index 000000000..9f9ee290c
--- /dev/null
+++ b/public/R/plot_Annotations_BinnedTruthMetrics.R
@@ -0,0 +1,190 @@
+#!/bin/env Rscript
+
+args <- commandArgs(TRUE)
+verbose = TRUE
+
+input = args[1]
+annotationName = args[2]
+minBinCutoff = as.numeric(args[3])
+medianNumVariants = args[4]
+
+c <- read.table(input, header=T)
+
+all = c[c$numVariants>minBinCutoff & c$category=="all",]
+novel = c[c$numVariants>minBinCutoff & c$category=="novel",]
+dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",]
+truth = c[c$numVariants>minBinCutoff & c$category=="truth",]
+
+#
+# Calculate min, max, medians
+#
+
+d = c[c$numVariants>minBinCutoff,]
+ymin = min(d$titv)
+ymax = max(d$titv)
+xmin = min(d$value)
+xmax = max(d$value)
+m = weighted.mean(all$value,all$numVariants/sum(all$numVariants))
+ma = all[all$value > m,]
+mb = all[all$value < m,]
+m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants))
+m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants))
+if(medianNumVariants == "true") {
+vc = cumsum( all$numVariants/sum(all$numVariants) )
+m10 = all$value[ max(which(vc<=0.10)) ]
+m25 = all$value[ max(which(vc<=0.25)) ]
+m = all$value[ max(which(vc<=0.5)) ]
+m75 = all$value[ min(which(vc>=0.75)) ]
+m90 = all$value[ min(which(vc>=0.90)) ]
+}
+
+#
+# Plot TiTv ratio as a function of the annotation
+#
+
+outfile = paste(input, ".TiTv.pdf", sep="")
+pdf(outfile, height=7, width=7)
+par(cex=1.1)
+plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
+axis(1,axTicks(1), format(axTicks(1), scientific=F))
+abline(v=m,lty=2,col="red")
+abline(v=m75,lty=3)
+abline(v=m25,lty=3)
+text(m, ymin, "50", col="red", cex=0.6);
+text(m75, ymin, "75", col="black", cex=0.6);
+text(m25, ymin, "25", col="black", cex=0.6);
+if(medianNumVariants == "true") {
+abline(v=m90,lty=3)
+abline(v=m10,lty=3)
+text(m10, ymin, "10", col="black", cex=0.6);
+text(m90, ymin, "90", col="black", cex=0.6);
+}
+points(novel$value,novel$titv,col="green",pch=20)
+points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
+if( sum(all$truePositive==0) != length(all$truePositive) ) {
+points(truth$value,truth$titv,col="magenta",pch=20)
+legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
+} else {
+legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
+}
+dev.off()
+
+#
+# Plot TiTv ratio as a function of the annotation, log scale on the x-axis
+#
+
+outfile = paste(input, ".TiTv_log.pdf", sep="")
+pdf(outfile, height=7, width=7)
+par(cex=1.1)
+plot(all$value,all$titv,xlab=annotationName,log="x",ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
+axis(1,axTicks(1), format(axTicks(1), scientific=F))
+abline(v=m,lty=2,col="red")
+abline(v=m75,lty=3)
+abline(v=m25,lty=3)
+text(m, ymin, "50", col="red", cex=0.6);
+text(m75, ymin, "75", col="black", cex=0.6);
+text(m25, ymin, "25", col="black", cex=0.6);
+if(medianNumVariants == "true") {
+abline(v=m90,lty=3)
+abline(v=m10,lty=3)
+text(m10, ymin, "10", col="black", cex=0.6);
+text(m90, ymin, "90", col="black", cex=0.6);
+}
+points(novel$value,novel$titv,col="green",pch=20)
+points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
+if( sum(all$truePositive==0) != length(all$truePositive) ) {
+points(truth$value,truth$titv,col="magenta",pch=20)
+legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
+} else {
+legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
+}
+dev.off()
+
+#
+# Plot dbsnp and true positive rate as a function of the annotation
+#
+
+ymin = min(all$dbsnp)
+ymax = max(all$dbsnp)
+outfile = paste(input, ".truthRate.pdf", sep="")
+pdf(outfile, height=7, width=7)
+par(cex=1.1)
+yLabel = "DBsnp Rate"
+if( sum(all$truePositive==0) != length(all$truePositive) ) {
+t = all[all$truePositive>0,]
+yLabel = "DBsnp/True Positive Rate"
+ymin = min(min(all$dbsnp),min(t$truePositive))
+ymax = max(max(all$dbsnp),max(t$truePositive))
+}
+plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
+axis(1,axTicks(1), format(axTicks(1), scientific=F))
+abline(v=m,lty=2,col="red")
+abline(v=m75,lty=3)
+abline(v=m25,lty=3)
+text(m, ymin, "50", col="red", cex=0.6);
+text(m75, ymin, "75", col="black", cex=0.6);
+text(m25, ymin, "25", col="black", cex=0.6);
+if(medianNumVariants == "true") {
+abline(v=m90,lty=3)
+abline(v=m10,lty=3)
+text(m10, ymin, "10", col="black", cex=0.6);
+text(m90, ymin, "90", col="black", cex=0.6);
+}
+if( sum(all$truePositive==0) != length(all$truePositive) ) {
+points(t$value,t$truePositive,col="magenta",pch=20);
+legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
+}
+dev.off()
+
+#
+# Plot dbsnp and true positive rate as a function of the annotation, log scale on the x-axis
+#
+
+outfile = paste(input, ".truthRate_log.pdf", sep="")
+pdf(outfile, height=7, width=7)
+par(cex=1.1)
+yLabel = "DBsnp Rate"
+if( sum(all$truePositive==0) != length(all$truePositive) ) {
+yLabel = "DBsnp/Truth Rate"
+}
+plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,ylim=c(ymin,ymax),pch=20,xaxt="n",ps=14);
+axis(1,axTicks(1), format(axTicks(1), scientific=F))
+abline(v=m,lty=2,col="red")
+abline(v=m75,lty=3)
+abline(v=m25,lty=3)
+text(m, ymin, "50", col="red", cex=0.6);
+text(m75, ymin, "75", col="black", cex=0.6);
+text(m25, ymin, "25", col="black", cex=0.6);
+if(medianNumVariants == "true") {
+abline(v=m90,lty=3)
+abline(v=m10,lty=3)
+text(m10, ymin, "10", col="black", cex=0.6);
+text(m90, ymin, "90", col="black", cex=0.6);
+}
+if( sum(all$truePositive==0) != length(all$truePositive) ) {
+points(t$value,t$truePositive,col="magenta",pch=20);
+legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
+}
+dev.off()
+
+#
+# Plot histogram of the annotation's value
+#
+
+outfile = paste(input, ".Histogram.pdf", sep="")
+pdf(outfile, height=7, width=7)
+par(cex=1.1)
+plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4);
+axis(1,axTicks(1), format(axTicks(1), scientific=F))
+dev.off()
+
+#
+# Plot histogram of the annotation's value, log scale on x-axis
+#
+
+outfile = paste(input, ".Histogram_log.pdf", sep="")
+pdf(outfile, height=7, width=7)
+par(cex=1.1)
+plot(all$value,all$numVariants,xlab=annotationName,log="x",ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4);
+axis(1,axTicks(1), format(axTicks(1), scientific=F))
+dev.off()
diff --git a/public/R/plot_Tranches.R b/public/R/plot_Tranches.R
new file mode 100755
index 000000000..a79ddd3ab
--- /dev/null
+++ b/public/R/plot_Tranches.R
@@ -0,0 +1,87 @@
+#!/bin/env Rscript
+
+args <- commandArgs(TRUE)
+verbose = TRUE
+
+tranchesFile = args[1]
+targetTITV = as.numeric(args[2])
+targetSensitivity = as.numeric(args[3])
+suppressLegend = ! is.na(args[4])
+
+# -----------------------------------------------------------------------------------------------
+# Useful general routines
+# -----------------------------------------------------------------------------------------------
+
+MIN_FP_RATE = 0.001 # 1 / 1000 is min error rate 
+
+titvFPEst <- function(titvExpected, titvObserved) { 
+    max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), MIN_FP_RATE) 
+}
+
+titvFPEstV <- function(titvExpected, titvs) {
+    sapply(titvs, function(x) titvFPEst(titvExpected, x))
+}
+
+nTPFP <- function(nVariants, FDR) {
+    return(list(TP = nVariants * (1 - FDR/100), FP = nVariants * (FDR / 100)))
+}
+
+leftShift <- function(x, leftValue = 0) {
+    r = rep(leftValue, length(x))
+    for ( i in 1:(length(x)-1) ) {
+        #print(list(i=i))
+        r[i] = x[i+1]
+    }
+    r
+}
+
+# -----------------------------------------------------------------------------------------------
+# Tranches plot
+# -----------------------------------------------------------------------------------------------
+data2 = read.table(tranchesFile,sep=",",head=T)
+data2 = data2[order(data2$novelTiTv, decreasing=F),]
+#data2 = data2[order(data2$FDRtranche, decreasing=T),]
+cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange")
+density=c(20, -1, -1, 20)
+outfile = paste(tranchesFile, ".pdf", sep="")
+pdf(outfile, height=5, width=8)
+par(mar = c(5, 5, 4, 2) + 0.1)
+novelTiTv = c(data2$novelTITV,data2$novelTiTv)
+alpha = 1 - titvFPEstV(targetTITV, novelTiTv)
+#print(alpha)
+
+numGood = round(alpha * data2$numNovel);
+
+#numGood = round(data2$numNovel * (1-data2$targetTruthSensitivity/100))
+numBad = data2$numNovel - numGood;
+
+numPrevGood = leftShift(numGood, 0)
+numNewGood = numGood - numPrevGood
+numPrevBad = leftShift(numBad, 0)
+numNewBad = numBad - numPrevBad
+
+d=matrix(c(numPrevGood,numNewGood, numNewBad, numPrevBad),4,byrow=TRUE)
+#print(d)
+barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (1000s)", density=density, cex.axis=1.25, cex.lab=1.25) # , xlim=c(250000,350000))
+#abline(v= d[2,dim(d)[2]], lty=2)
+#abline(v= d[1,3], lty=2)
+if ( ! suppressLegend ) 
+    legend(3, length(data2$targetTruthSensitivity)/3 +1, c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25)
+
+mtext("Ti/Tv",2,line=2.25,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
+mtext("truth",2,line=0,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
+axis(2,line=-1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=data2$targetTruthSensitivity, las=1, cex.axis=1.0)
+axis(2,line=1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0)
+
+# plot sensitivity vs. specificity
+sensitivity = data2$truthSensitivity
+if ( ! is.null(sensitivity) ) {
+    #specificity = titvFPEstV(targetTITV, novelTiTv)
+    specificity = novelTiTv
+    plot(sensitivity, specificity, type="b", col="cornflowerblue", xlab="Tranche truth sensitivity", ylab="Specificity (Novel Ti/Tv ratio)")
+    abline(h=targetTITV, lty=2)
+    abline(v=targetSensitivity, lty=2)
+    #text(max(sensitivity), targetTITV-0.05, labels="Expected novel Ti/Tv", pos=2)
+}
+
+dev.off()
diff --git a/public/R/plot_residualError_OtherCovariate.R b/public/R/plot_residualError_OtherCovariate.R
new file mode 100644
index 000000000..a1385ff3f
--- /dev/null
+++ b/public/R/plot_residualError_OtherCovariate.R
@@ -0,0 +1,108 @@
+#!/bin/env Rscript
+
+args <- commandArgs(TRUE)
+verbose = TRUE
+
+input = args[1]
+covariateName = args[2]
+
+outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="")
+pdf(outfile, height=7, width=7)
+par(cex=1.1)
+c <- read.table(input, header=T)
+c <- c[sort.list(c[,1]),]
+
+#
+# Plot residual error as a function of the covariate
+#
+
+d.good <- c[c$nBases >= 1000,]
+d.1000 <- c[c$nBases < 1000,]
+rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
+rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) )
+theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
+if( length(d.good$nBases) == length(c$nBases) ) {
+	theTitle = paste("RMSE =", round(rmseAll,digits=3))
+}
+# Don't let residual error go off the edge of the plot
+d.good$residualError = d.good$Qempirical-d.good$Qreported
+d.good$residualError[which(d.good$residualError > 10)] = 10
+d.good$residualError[which(d.good$residualError < -10)] = -10
+d.1000$residualError = d.1000$Qempirical-d.1000$Qreported
+d.1000$residualError[which(d.1000$residualError > 10)] = 10
+d.1000$residualError[which(d.1000$residualError < -10)] = -10
+c$residualError = c$Qempirical-c$Qreported
+c$residualError[which(c$residualError > 10)] = 10
+c$residualError[which(c$residualError < -10)] = -10
+pointType = "p"
+if( length(c$Covariate) <= 20 ) {
+    pointType = "o"
+}
+if( is.numeric(c$Covariate) ) {
+	plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
+	points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20)
+} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
+	plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10))
+	points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue")
+}
+dev.off()
+
+
+#
+# Plot mean quality versus the covariate
+#
+
+outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="")
+pdf(outfile, height=7, width=7)
+par(cex=1.1)
+pointType = "p"
+if( length(c$Covariate) <= 20 ) {
+    pointType = "o"
+}
+theTitle = paste("Quality By", covariateName);
+if( is.numeric(c$Covariate) ) {
+	plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate)))
+	points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20)
+} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
+	plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40))
+	points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue")
+}
+dev.off()
+
+#
+# Plot histogram of the covariate
+#
+
+e = d.good
+f = d.1000
+outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
+pdf(outfile, height=7, width=7)
+hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0)
+hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0)
+
+lwdSize=2
+if( length(c$Covariate) <= 20 ) {
+    lwdSize=7
+} else if( length(c$Covariate) <= 70 ) {
+    lwdSize=4
+}
+
+if( is.numeric(c$Covariate) ) {
+    if( length(hst$e.Covariate) == 0 ) {
+        plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
+    } else {
+	    plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
+	    points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue")
+	}
+	axis(2,axTicks(2), format(axTicks(2), scientific=F))
+} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
+	hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
+	plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n")
+	if( length(hst$c.Covariate) > 9 ) {
+	    axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
+	} else {
+	    axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate)
+	}
+	axis(2,axTicks(2), format(axTicks(2), scientific=F))
+}
+dev.off()
diff --git a/public/R/plot_residualError_QualityScoreCovariate.R b/public/R/plot_residualError_QualityScoreCovariate.R
new file mode 100644
index 000000000..81bc9460d
--- /dev/null
+++ b/public/R/plot_residualError_QualityScoreCovariate.R
@@ -0,0 +1,70 @@
+#!/bin/env Rscript
+
+args <- commandArgs(TRUE)
+
+input = args[1]
+Qcutoff = as.numeric(args[2])
+maxQ = as.numeric(args[3])
+maxHist = as.numeric(args[4])
+
+t=read.table(input, header=T)
+
+#
+# Plot of reported quality versus empirical quality
+#
+
+outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
+pdf(outfile, height=7, width=7)
+d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,]
+d.1000 <- t[t$nBases < 1000  & t$Qreported >= Qcutoff,]
+d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000  & t$Qreported >= Qcutoff,]
+f <- t[t$Qreported < Qcutoff,]
+e <- rbind(d.good, d.1000, d.10000)
+rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
+rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) )
+theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
+if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) {
+	theTitle = paste("RMSE =", round(rmseAll,digits=3));
+}
+plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
+points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
+points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16)
+points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
+abline(0,1, lty=2)
+dev.off()
+
+#
+# Plot Q empirical histogram
+#
+
+outfile = paste(input, ".quality_emp_hist.pdf", sep="")
+pdf(outfile, height=7, width=7)
+hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
+hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0)
+percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases))
+entropy = -sum(log2(percentBases)*percentBases)
+yMax = max(hst$e.nBases)
+if(maxHist != 0) {
+yMax = maxHist
+}
+plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n")
+points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1")
+axis(2,axTicks(2), format(axTicks(2), scientific=F))
+dev.off()
+
+#
+# Plot Q reported histogram
+#
+
+outfile = paste(input, ".quality_rep_hist.pdf", sep="")
+pdf(outfile, height=7, width=7)
+hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
+hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0)
+yMax = max(hst$e.nBases)
+if(maxHist != 0) {
+yMax = maxHist
+}
+plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n")
+points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1")
+axis(2,axTicks(2), format(axTicks(2), scientific=F))
+dev.off()
diff --git a/public/R/titvFPEst.R b/public/R/titvFPEst.R
new file mode 100755
index 000000000..7af5e8bbb
--- /dev/null
+++ b/public/R/titvFPEst.R
@@ -0,0 +1,138 @@
+titvFPEst <- function(titvExpected, titvObserved) { max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), 0.001) }
+
+titvFPEstV <- function(titvExpected, titvs) {
+    sapply(titvs, function(x) titvFPEst(titvExpected, x))
+}
+
+calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) {
+  TP <- nknown + (1-titvFPEst(knownTiTv, novelTiTv)) * nnovel
+  2 * TP / 3 / callable
+}
+
+marginalTiTv <- function( nx, titvx, ny, titvy ) {
+    tvx = nx / (titvx + 1)
+    tix = nx - tvx
+    tvy = ny / (titvy + 1)
+    tiy = ny - tvy
+    tiz = tix - tiy
+    tvz = tvx - tvy
+    return(tiz / tvz)
+}
+marginaldbSNPRate <- function( nx, dbx, ny, dby ) {
+    knownx = nx * dbx / 100
+    novelx = nx - knownx
+    knowny = ny * dby / 100
+    novely = ny - knowny
+    knownz = knownx - knowny
+    novelz = novelx - novely
+    return(knownz / ( knownz + novelz ) * 100)
+}
+
+numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) {
+    nCalls <- L * theta * calledFractionOfRegion * sum(1 / seq(1, 2 * nIndividuals))
+    return(list(nCalls = nCalls, nKnown = dbSNPRate * nCalls, nNovel = (1-dbSNPRate) * nCalls))
+}
+
+normalize <- function(x) {
+    x / sum(x)
+}
+
+normcumsum <- function(x) {
+    cumsum(normalize(x))
+}
+
+cumhist <- function(d, ...) {
+    plot(d[order(d)], type="b", col="orange", lwd=2, ...)
+}
+
+revcumsum <- function(x) {
+   return(rev(cumsum(rev(x))))
+}
+
+phred <- function(x) {
+    log10(max(x,10^(-9.9)))*-10
+}
+
+pOfB <- function(b, B, Q) {
+    #print(paste(b, B, Q))
+    p = 1 - 10^(-Q/10)
+    if ( b == B )
+        return(p) 
+    else
+        return(1 - p)
+}
+
+pOfG <- function(bs, qs, G) {
+    a1 = G[1]
+    a2 = G[2]
+    
+    log10p = 0
+    for ( i in 1:length(bs) ) {
+        b = bs[i]
+        q = qs[i]
+        p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2
+        log10p = log10p + log10(p1)
+    }
+    
+    return(log10p)
+}
+
+pOfGs <- function(nAs, nBs, Q) {
+    bs = c(rep("a", nAs), rep("t", nBs))
+    qs = rep(Q, nAs + nBs)
+    G1 = c("a", "a")
+    G2 = c("a", "t")
+    G3 = c("t", "t")
+
+    log10p1 = pOfG(bs, qs, G1)
+    log10p2 = pOfG(bs, qs, G2)
+    log10p3 = pOfG(bs, qs, G3)
+    Qsample = phred(1 - 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3))))
+    
+    return(list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample)) 
+}
+
+QsampleExpected <- function(depth, Q) {
+    weightedAvg = 0
+    for ( d in 1:(depth*3) ) {
+        Qsample = 0
+        pOfD = dpois(d, depth)
+        for ( nBs in 0:d ) {
+            pOfnB = dbinom(nBs, d, 0.5)
+            nAs = d - nBs
+            Qsample = pOfGs(nAs, nBs, Q)$Qsample
+            #Qsample = 1
+            weightedAvg = weightedAvg + Qsample * pOfD * pOfnB
+            print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg)))
+        }
+    }
+    
+    return(weightedAvg)
+}
+
+plotQsamples <- function(depths, Qs, Qmax) {
+    cols = rainbow(length(Qs))
+    plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling")
+    
+    for ( i in 1:length(Qs) ) {
+        Q = Qs[i]
+        y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q)))
+        points(depths, y, col=cols[i], type="b")
+    }
+
+    legend("topleft", paste("Q", Qs), fill=cols)
+}
+
+pCallHetGivenDepth <- function(depth, nallelesToCall) {
+  depths = 0:(2*depth)   
+  pNoAllelesToCall = apply(as.matrix(depths),1,function(d) sum(dbinom(0:nallelesToCall,d,0.5)))
+  dpois(depths,depth)*(1-pNoAllelesToCall)
+}
+
+pCallHets <- function(depth, nallelesToCall) {
+  sum(pCallHetGivenDepth(depth,nallelesToCall))
+}
+
+pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) {
+  1-(1-pCallHets(depth,nallelesToCall))^nsamples
+}