From f4ae6edb9240ee80debba64693c678d782b23ca1 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 30 Jun 2011 14:55:25 -0400 Subject: [PATCH] Moving some of the released R scripts into public from private --- .../R/plot_Annotations_BinnedTruthMetrics.R | 190 ++++++++++++++++++ public/R/plot_Tranches.R | 87 ++++++++ public/R/plot_residualError_OtherCovariate.R | 108 ++++++++++ ...plot_residualError_QualityScoreCovariate.R | 70 +++++++ public/R/titvFPEst.R | 138 +++++++++++++ 5 files changed, 593 insertions(+) create mode 100644 public/R/plot_Annotations_BinnedTruthMetrics.R create mode 100755 public/R/plot_Tranches.R create mode 100644 public/R/plot_residualError_OtherCovariate.R create mode 100644 public/R/plot_residualError_QualityScoreCovariate.R create mode 100755 public/R/titvFPEst.R diff --git a/public/R/plot_Annotations_BinnedTruthMetrics.R b/public/R/plot_Annotations_BinnedTruthMetrics.R new file mode 100644 index 000000000..9f9ee290c --- /dev/null +++ b/public/R/plot_Annotations_BinnedTruthMetrics.R @@ -0,0 +1,190 @@ +#!/bin/env Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +input = args[1] +annotationName = args[2] +minBinCutoff = as.numeric(args[3]) +medianNumVariants = args[4] + +c <- read.table(input, header=T) + +all = c[c$numVariants>minBinCutoff & c$category=="all",] +novel = c[c$numVariants>minBinCutoff & c$category=="novel",] +dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",] +truth = c[c$numVariants>minBinCutoff & c$category=="truth",] + +# +# Calculate min, max, medians +# + +d = c[c$numVariants>minBinCutoff,] +ymin = min(d$titv) +ymax = max(d$titv) +xmin = min(d$value) +xmax = max(d$value) +m = weighted.mean(all$value,all$numVariants/sum(all$numVariants)) +ma = all[all$value > m,] +mb = all[all$value < m,] +m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants)) +m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants)) +if(medianNumVariants == "true") { +vc = cumsum( all$numVariants/sum(all$numVariants) ) +m10 = all$value[ max(which(vc<=0.10)) ] +m25 = all$value[ max(which(vc<=0.25)) ] +m = all$value[ max(which(vc<=0.5)) ] +m75 = all$value[ min(which(vc>=0.75)) ] +m90 = all$value[ min(which(vc>=0.90)) ] +} + +# +# Plot TiTv ratio as a function of the annotation +# + +outfile = paste(input, ".TiTv.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2,col="red") +abline(v=m75,lty=3) +abline(v=m25,lty=3) +text(m, ymin, "50", col="red", cex=0.6); +text(m75, ymin, "75", col="black", cex=0.6); +text(m25, ymin, "25", col="black", cex=0.6); +if(medianNumVariants == "true") { +abline(v=m90,lty=3) +abline(v=m10,lty=3) +text(m10, ymin, "10", col="black", cex=0.6); +text(m90, ymin, "90", col="black", cex=0.6); +} +points(novel$value,novel$titv,col="green",pch=20) +points(dbsnp$value,dbsnp$titv,col="blue",pch=20) +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(truth$value,truth$titv,col="magenta",pch=20) +legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20)) +} else { +legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) +} +dev.off() + +# +# Plot TiTv ratio as a function of the annotation, log scale on the x-axis +# + +outfile = paste(input, ".TiTv_log.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$titv,xlab=annotationName,log="x",ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2,col="red") +abline(v=m75,lty=3) +abline(v=m25,lty=3) +text(m, ymin, "50", col="red", cex=0.6); +text(m75, ymin, "75", col="black", cex=0.6); +text(m25, ymin, "25", col="black", cex=0.6); +if(medianNumVariants == "true") { +abline(v=m90,lty=3) +abline(v=m10,lty=3) +text(m10, ymin, "10", col="black", cex=0.6); +text(m90, ymin, "90", col="black", cex=0.6); +} +points(novel$value,novel$titv,col="green",pch=20) +points(dbsnp$value,dbsnp$titv,col="blue",pch=20) +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(truth$value,truth$titv,col="magenta",pch=20) +legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20)) +} else { +legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) +} +dev.off() + +# +# Plot dbsnp and true positive rate as a function of the annotation +# + +ymin = min(all$dbsnp) +ymax = max(all$dbsnp) +outfile = paste(input, ".truthRate.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +yLabel = "DBsnp Rate" +if( sum(all$truePositive==0) != length(all$truePositive) ) { +t = all[all$truePositive>0,] +yLabel = "DBsnp/True Positive Rate" +ymin = min(min(all$dbsnp),min(t$truePositive)) +ymax = max(max(all$dbsnp),max(t$truePositive)) +} +plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2,col="red") +abline(v=m75,lty=3) +abline(v=m25,lty=3) +text(m, ymin, "50", col="red", cex=0.6); +text(m75, ymin, "75", col="black", cex=0.6); +text(m25, ymin, "25", col="black", cex=0.6); +if(medianNumVariants == "true") { +abline(v=m90,lty=3) +abline(v=m10,lty=3) +text(m10, ymin, "10", col="black", cex=0.6); +text(m90, ymin, "90", col="black", cex=0.6); +} +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(t$value,t$truePositive,col="magenta",pch=20); +legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20)) +} +dev.off() + +# +# Plot dbsnp and true positive rate as a function of the annotation, log scale on the x-axis +# + +outfile = paste(input, ".truthRate_log.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +yLabel = "DBsnp Rate" +if( sum(all$truePositive==0) != length(all$truePositive) ) { +yLabel = "DBsnp/Truth Rate" +} +plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,ylim=c(ymin,ymax),pch=20,xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2,col="red") +abline(v=m75,lty=3) +abline(v=m25,lty=3) +text(m, ymin, "50", col="red", cex=0.6); +text(m75, ymin, "75", col="black", cex=0.6); +text(m25, ymin, "25", col="black", cex=0.6); +if(medianNumVariants == "true") { +abline(v=m90,lty=3) +abline(v=m10,lty=3) +text(m10, ymin, "10", col="black", cex=0.6); +text(m90, ymin, "90", col="black", cex=0.6); +} +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(t$value,t$truePositive,col="magenta",pch=20); +legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20)) +} +dev.off() + +# +# Plot histogram of the annotation's value +# + +outfile = paste(input, ".Histogram.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +dev.off() + +# +# Plot histogram of the annotation's value, log scale on x-axis +# + +outfile = paste(input, ".Histogram_log.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$numVariants,xlab=annotationName,log="x",ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +dev.off() diff --git a/public/R/plot_Tranches.R b/public/R/plot_Tranches.R new file mode 100755 index 000000000..a79ddd3ab --- /dev/null +++ b/public/R/plot_Tranches.R @@ -0,0 +1,87 @@ +#!/bin/env Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +tranchesFile = args[1] +targetTITV = as.numeric(args[2]) +targetSensitivity = as.numeric(args[3]) +suppressLegend = ! is.na(args[4]) + +# ----------------------------------------------------------------------------------------------- +# Useful general routines +# ----------------------------------------------------------------------------------------------- + +MIN_FP_RATE = 0.001 # 1 / 1000 is min error rate + +titvFPEst <- function(titvExpected, titvObserved) { + max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), MIN_FP_RATE) +} + +titvFPEstV <- function(titvExpected, titvs) { + sapply(titvs, function(x) titvFPEst(titvExpected, x)) +} + +nTPFP <- function(nVariants, FDR) { + return(list(TP = nVariants * (1 - FDR/100), FP = nVariants * (FDR / 100))) +} + +leftShift <- function(x, leftValue = 0) { + r = rep(leftValue, length(x)) + for ( i in 1:(length(x)-1) ) { + #print(list(i=i)) + r[i] = x[i+1] + } + r +} + +# ----------------------------------------------------------------------------------------------- +# Tranches plot +# ----------------------------------------------------------------------------------------------- +data2 = read.table(tranchesFile,sep=",",head=T) +data2 = data2[order(data2$novelTiTv, decreasing=F),] +#data2 = data2[order(data2$FDRtranche, decreasing=T),] +cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange") +density=c(20, -1, -1, 20) +outfile = paste(tranchesFile, ".pdf", sep="") +pdf(outfile, height=5, width=8) +par(mar = c(5, 5, 4, 2) + 0.1) +novelTiTv = c(data2$novelTITV,data2$novelTiTv) +alpha = 1 - titvFPEstV(targetTITV, novelTiTv) +#print(alpha) + +numGood = round(alpha * data2$numNovel); + +#numGood = round(data2$numNovel * (1-data2$targetTruthSensitivity/100)) +numBad = data2$numNovel - numGood; + +numPrevGood = leftShift(numGood, 0) +numNewGood = numGood - numPrevGood +numPrevBad = leftShift(numBad, 0) +numNewBad = numBad - numPrevBad + +d=matrix(c(numPrevGood,numNewGood, numNewBad, numPrevBad),4,byrow=TRUE) +#print(d) +barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (1000s)", density=density, cex.axis=1.25, cex.lab=1.25) # , xlim=c(250000,350000)) +#abline(v= d[2,dim(d)[2]], lty=2) +#abline(v= d[1,3], lty=2) +if ( ! suppressLegend ) + legend(3, length(data2$targetTruthSensitivity)/3 +1, c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25) + +mtext("Ti/Tv",2,line=2.25,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1) +mtext("truth",2,line=0,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1) +axis(2,line=-1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=data2$targetTruthSensitivity, las=1, cex.axis=1.0) +axis(2,line=1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0) + +# plot sensitivity vs. specificity +sensitivity = data2$truthSensitivity +if ( ! is.null(sensitivity) ) { + #specificity = titvFPEstV(targetTITV, novelTiTv) + specificity = novelTiTv + plot(sensitivity, specificity, type="b", col="cornflowerblue", xlab="Tranche truth sensitivity", ylab="Specificity (Novel Ti/Tv ratio)") + abline(h=targetTITV, lty=2) + abline(v=targetSensitivity, lty=2) + #text(max(sensitivity), targetTITV-0.05, labels="Expected novel Ti/Tv", pos=2) +} + +dev.off() diff --git a/public/R/plot_residualError_OtherCovariate.R b/public/R/plot_residualError_OtherCovariate.R new file mode 100644 index 000000000..a1385ff3f --- /dev/null +++ b/public/R/plot_residualError_OtherCovariate.R @@ -0,0 +1,108 @@ +#!/bin/env Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +input = args[1] +covariateName = args[2] + +outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +c <- read.table(input, header=T) +c <- c[sort.list(c[,1]),] + +# +# Plot residual error as a function of the covariate +# + +d.good <- c[c$nBases >= 1000,] +d.1000 <- c[c$nBases < 1000,] +rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh +rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) ) +theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3)) +if( length(d.good$nBases) == length(c$nBases) ) { + theTitle = paste("RMSE =", round(rmseAll,digits=3)) +} +# Don't let residual error go off the edge of the plot +d.good$residualError = d.good$Qempirical-d.good$Qreported +d.good$residualError[which(d.good$residualError > 10)] = 10 +d.good$residualError[which(d.good$residualError < -10)] = -10 +d.1000$residualError = d.1000$Qempirical-d.1000$Qreported +d.1000$residualError[which(d.1000$residualError > 10)] = 10 +d.1000$residualError[which(d.1000$residualError < -10)] = -10 +c$residualError = c$Qempirical-c$Qreported +c$residualError[which(c$residualError > 10)] = 10 +c$residualError[which(c$residualError < -10)] = -10 +pointType = "p" +if( length(c$Covariate) <= 20 ) { + pointType = "o" +} +if( is.numeric(c$Covariate) ) { + plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate))) + points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20) +} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice + plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10)) + points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue") +} +dev.off() + + +# +# Plot mean quality versus the covariate +# + +outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +pointType = "p" +if( length(c$Covariate) <= 20 ) { + pointType = "o" +} +theTitle = paste("Quality By", covariateName); +if( is.numeric(c$Covariate) ) { + plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate))) + points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20) +} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice + plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40)) + points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue") +} +dev.off() + +# +# Plot histogram of the covariate +# + +e = d.good +f = d.1000 +outfile = paste(input, ".", covariateName,"_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0) +hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0) + +lwdSize=2 +if( length(c$Covariate) <= 20 ) { + lwdSize=7 +} else if( length(c$Covariate) <= 70 ) { + lwdSize=4 +} + +if( is.numeric(c$Covariate) ) { + if( length(hst$e.Covariate) == 0 ) { + plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate))) + } else { + plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate))) + points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue") + } + axis(2,axTicks(2), format(axTicks(2), scientific=F)) +} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice + hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0) + plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n") + if( length(hst$c.Covariate) > 9 ) { + axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)]) + } else { + axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate) + } + axis(2,axTicks(2), format(axTicks(2), scientific=F)) +} +dev.off() diff --git a/public/R/plot_residualError_QualityScoreCovariate.R b/public/R/plot_residualError_QualityScoreCovariate.R new file mode 100644 index 000000000..81bc9460d --- /dev/null +++ b/public/R/plot_residualError_QualityScoreCovariate.R @@ -0,0 +1,70 @@ +#!/bin/env Rscript + +args <- commandArgs(TRUE) + +input = args[1] +Qcutoff = as.numeric(args[2]) +maxQ = as.numeric(args[3]) +maxHist = as.numeric(args[4]) + +t=read.table(input, header=T) + +# +# Plot of reported quality versus empirical quality +# + +outfile = paste(input, ".quality_emp_v_stated.pdf", sep="") +pdf(outfile, height=7, width=7) +d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,] +d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,] +d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,] +f <- t[t$Qreported < Qcutoff,] +e <- rbind(d.good, d.1000, d.10000) +rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh +rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) ) +theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3)) +if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) { + theTitle = paste("RMSE =", round(rmseAll,digits=3)); +} +plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score") +points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16) +points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16) +points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16) +abline(0,1, lty=2) +dev.off() + +# +# Plot Q empirical histogram +# + +outfile = paste(input, ".quality_emp_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0) +hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0) +percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases)) +entropy = -sum(log2(percentBases)*percentBases) +yMax = max(hst$e.nBases) +if(maxHist != 0) { +yMax = maxHist +} +plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n") +points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1") +axis(2,axTicks(2), format(axTicks(2), scientific=F)) +dev.off() + +# +# Plot Q reported histogram +# + +outfile = paste(input, ".quality_rep_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0) +hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0) +yMax = max(hst$e.nBases) +if(maxHist != 0) { +yMax = maxHist +} +plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n") +points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1") +axis(2,axTicks(2), format(axTicks(2), scientific=F)) +dev.off() diff --git a/public/R/titvFPEst.R b/public/R/titvFPEst.R new file mode 100755 index 000000000..7af5e8bbb --- /dev/null +++ b/public/R/titvFPEst.R @@ -0,0 +1,138 @@ +titvFPEst <- function(titvExpected, titvObserved) { max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), 0.001) } + +titvFPEstV <- function(titvExpected, titvs) { + sapply(titvs, function(x) titvFPEst(titvExpected, x)) +} + +calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) { + TP <- nknown + (1-titvFPEst(knownTiTv, novelTiTv)) * nnovel + 2 * TP / 3 / callable +} + +marginalTiTv <- function( nx, titvx, ny, titvy ) { + tvx = nx / (titvx + 1) + tix = nx - tvx + tvy = ny / (titvy + 1) + tiy = ny - tvy + tiz = tix - tiy + tvz = tvx - tvy + return(tiz / tvz) +} +marginaldbSNPRate <- function( nx, dbx, ny, dby ) { + knownx = nx * dbx / 100 + novelx = nx - knownx + knowny = ny * dby / 100 + novely = ny - knowny + knownz = knownx - knowny + novelz = novelx - novely + return(knownz / ( knownz + novelz ) * 100) +} + +numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) { + nCalls <- L * theta * calledFractionOfRegion * sum(1 / seq(1, 2 * nIndividuals)) + return(list(nCalls = nCalls, nKnown = dbSNPRate * nCalls, nNovel = (1-dbSNPRate) * nCalls)) +} + +normalize <- function(x) { + x / sum(x) +} + +normcumsum <- function(x) { + cumsum(normalize(x)) +} + +cumhist <- function(d, ...) { + plot(d[order(d)], type="b", col="orange", lwd=2, ...) +} + +revcumsum <- function(x) { + return(rev(cumsum(rev(x)))) +} + +phred <- function(x) { + log10(max(x,10^(-9.9)))*-10 +} + +pOfB <- function(b, B, Q) { + #print(paste(b, B, Q)) + p = 1 - 10^(-Q/10) + if ( b == B ) + return(p) + else + return(1 - p) +} + +pOfG <- function(bs, qs, G) { + a1 = G[1] + a2 = G[2] + + log10p = 0 + for ( i in 1:length(bs) ) { + b = bs[i] + q = qs[i] + p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2 + log10p = log10p + log10(p1) + } + + return(log10p) +} + +pOfGs <- function(nAs, nBs, Q) { + bs = c(rep("a", nAs), rep("t", nBs)) + qs = rep(Q, nAs + nBs) + G1 = c("a", "a") + G2 = c("a", "t") + G3 = c("t", "t") + + log10p1 = pOfG(bs, qs, G1) + log10p2 = pOfG(bs, qs, G2) + log10p3 = pOfG(bs, qs, G3) + Qsample = phred(1 - 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3)))) + + return(list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample)) +} + +QsampleExpected <- function(depth, Q) { + weightedAvg = 0 + for ( d in 1:(depth*3) ) { + Qsample = 0 + pOfD = dpois(d, depth) + for ( nBs in 0:d ) { + pOfnB = dbinom(nBs, d, 0.5) + nAs = d - nBs + Qsample = pOfGs(nAs, nBs, Q)$Qsample + #Qsample = 1 + weightedAvg = weightedAvg + Qsample * pOfD * pOfnB + print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg))) + } + } + + return(weightedAvg) +} + +plotQsamples <- function(depths, Qs, Qmax) { + cols = rainbow(length(Qs)) + plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling") + + for ( i in 1:length(Qs) ) { + Q = Qs[i] + y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q))) + points(depths, y, col=cols[i], type="b") + } + + legend("topleft", paste("Q", Qs), fill=cols) +} + +pCallHetGivenDepth <- function(depth, nallelesToCall) { + depths = 0:(2*depth) + pNoAllelesToCall = apply(as.matrix(depths),1,function(d) sum(dbinom(0:nallelesToCall,d,0.5))) + dpois(depths,depth)*(1-pNoAllelesToCall) +} + +pCallHets <- function(depth, nallelesToCall) { + sum(pCallHetGivenDepth(depth,nallelesToCall)) +} + +pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) { + 1-(1-pCallHets(depth,nallelesToCall))^nsamples +}