Moving some of the released R scripts into public from private

This commit is contained in:
Ryan Poplin 2011-06-30 14:55:25 -04:00
parent bb6b1d615b
commit f4ae6edb92
5 changed files with 593 additions and 0 deletions

View File

@ -0,0 +1,190 @@
#!/bin/env Rscript
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
annotationName = args[2]
minBinCutoff = as.numeric(args[3])
medianNumVariants = args[4]
c <- read.table(input, header=T)
all = c[c$numVariants>minBinCutoff & c$category=="all",]
novel = c[c$numVariants>minBinCutoff & c$category=="novel",]
dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",]
truth = c[c$numVariants>minBinCutoff & c$category=="truth",]
#
# Calculate min, max, medians
#
d = c[c$numVariants>minBinCutoff,]
ymin = min(d$titv)
ymax = max(d$titv)
xmin = min(d$value)
xmax = max(d$value)
m = weighted.mean(all$value,all$numVariants/sum(all$numVariants))
ma = all[all$value > m,]
mb = all[all$value < m,]
m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants))
m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants))
if(medianNumVariants == "true") {
vc = cumsum( all$numVariants/sum(all$numVariants) )
m10 = all$value[ max(which(vc<=0.10)) ]
m25 = all$value[ max(which(vc<=0.25)) ]
m = all$value[ max(which(vc<=0.5)) ]
m75 = all$value[ min(which(vc>=0.75)) ]
m90 = all$value[ min(which(vc>=0.90)) ]
}
#
# Plot TiTv ratio as a function of the annotation
#
outfile = paste(input, ".TiTv.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
abline(v=m,lty=2,col="red")
abline(v=m75,lty=3)
abline(v=m25,lty=3)
text(m, ymin, "50", col="red", cex=0.6);
text(m75, ymin, "75", col="black", cex=0.6);
text(m25, ymin, "25", col="black", cex=0.6);
if(medianNumVariants == "true") {
abline(v=m90,lty=3)
abline(v=m10,lty=3)
text(m10, ymin, "10", col="black", cex=0.6);
text(m90, ymin, "90", col="black", cex=0.6);
}
points(novel$value,novel$titv,col="green",pch=20)
points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(truth$value,truth$titv,col="magenta",pch=20)
legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
} else {
legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
}
dev.off()
#
# Plot TiTv ratio as a function of the annotation, log scale on the x-axis
#
outfile = paste(input, ".TiTv_log.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$titv,xlab=annotationName,log="x",ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
abline(v=m,lty=2,col="red")
abline(v=m75,lty=3)
abline(v=m25,lty=3)
text(m, ymin, "50", col="red", cex=0.6);
text(m75, ymin, "75", col="black", cex=0.6);
text(m25, ymin, "25", col="black", cex=0.6);
if(medianNumVariants == "true") {
abline(v=m90,lty=3)
abline(v=m10,lty=3)
text(m10, ymin, "10", col="black", cex=0.6);
text(m90, ymin, "90", col="black", cex=0.6);
}
points(novel$value,novel$titv,col="green",pch=20)
points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(truth$value,truth$titv,col="magenta",pch=20)
legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
} else {
legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
}
dev.off()
#
# Plot dbsnp and true positive rate as a function of the annotation
#
ymin = min(all$dbsnp)
ymax = max(all$dbsnp)
outfile = paste(input, ".truthRate.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
yLabel = "DBsnp Rate"
if( sum(all$truePositive==0) != length(all$truePositive) ) {
t = all[all$truePositive>0,]
yLabel = "DBsnp/True Positive Rate"
ymin = min(min(all$dbsnp),min(t$truePositive))
ymax = max(max(all$dbsnp),max(t$truePositive))
}
plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
abline(v=m,lty=2,col="red")
abline(v=m75,lty=3)
abline(v=m25,lty=3)
text(m, ymin, "50", col="red", cex=0.6);
text(m75, ymin, "75", col="black", cex=0.6);
text(m25, ymin, "25", col="black", cex=0.6);
if(medianNumVariants == "true") {
abline(v=m90,lty=3)
abline(v=m10,lty=3)
text(m10, ymin, "10", col="black", cex=0.6);
text(m90, ymin, "90", col="black", cex=0.6);
}
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(t$value,t$truePositive,col="magenta",pch=20);
legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
}
dev.off()
#
# Plot dbsnp and true positive rate as a function of the annotation, log scale on the x-axis
#
outfile = paste(input, ".truthRate_log.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
yLabel = "DBsnp Rate"
if( sum(all$truePositive==0) != length(all$truePositive) ) {
yLabel = "DBsnp/Truth Rate"
}
plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,ylim=c(ymin,ymax),pch=20,xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
abline(v=m,lty=2,col="red")
abline(v=m75,lty=3)
abline(v=m25,lty=3)
text(m, ymin, "50", col="red", cex=0.6);
text(m75, ymin, "75", col="black", cex=0.6);
text(m25, ymin, "25", col="black", cex=0.6);
if(medianNumVariants == "true") {
abline(v=m90,lty=3)
abline(v=m10,lty=3)
text(m10, ymin, "10", col="black", cex=0.6);
text(m90, ymin, "90", col="black", cex=0.6);
}
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(t$value,t$truePositive,col="magenta",pch=20);
legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
}
dev.off()
#
# Plot histogram of the annotation's value
#
outfile = paste(input, ".Histogram.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
dev.off()
#
# Plot histogram of the annotation's value, log scale on x-axis
#
outfile = paste(input, ".Histogram_log.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$numVariants,xlab=annotationName,log="x",ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
dev.off()

View File

@ -0,0 +1,87 @@
#!/bin/env Rscript
args <- commandArgs(TRUE)
verbose = TRUE
tranchesFile = args[1]
targetTITV = as.numeric(args[2])
targetSensitivity = as.numeric(args[3])
suppressLegend = ! is.na(args[4])
# -----------------------------------------------------------------------------------------------
# Useful general routines
# -----------------------------------------------------------------------------------------------
MIN_FP_RATE = 0.001 # 1 / 1000 is min error rate
titvFPEst <- function(titvExpected, titvObserved) {
max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), MIN_FP_RATE)
}
titvFPEstV <- function(titvExpected, titvs) {
sapply(titvs, function(x) titvFPEst(titvExpected, x))
}
nTPFP <- function(nVariants, FDR) {
return(list(TP = nVariants * (1 - FDR/100), FP = nVariants * (FDR / 100)))
}
leftShift <- function(x, leftValue = 0) {
r = rep(leftValue, length(x))
for ( i in 1:(length(x)-1) ) {
#print(list(i=i))
r[i] = x[i+1]
}
r
}
# -----------------------------------------------------------------------------------------------
# Tranches plot
# -----------------------------------------------------------------------------------------------
data2 = read.table(tranchesFile,sep=",",head=T)
data2 = data2[order(data2$novelTiTv, decreasing=F),]
#data2 = data2[order(data2$FDRtranche, decreasing=T),]
cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange")
density=c(20, -1, -1, 20)
outfile = paste(tranchesFile, ".pdf", sep="")
pdf(outfile, height=5, width=8)
par(mar = c(5, 5, 4, 2) + 0.1)
novelTiTv = c(data2$novelTITV,data2$novelTiTv)
alpha = 1 - titvFPEstV(targetTITV, novelTiTv)
#print(alpha)
numGood = round(alpha * data2$numNovel);
#numGood = round(data2$numNovel * (1-data2$targetTruthSensitivity/100))
numBad = data2$numNovel - numGood;
numPrevGood = leftShift(numGood, 0)
numNewGood = numGood - numPrevGood
numPrevBad = leftShift(numBad, 0)
numNewBad = numBad - numPrevBad
d=matrix(c(numPrevGood,numNewGood, numNewBad, numPrevBad),4,byrow=TRUE)
#print(d)
barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (1000s)", density=density, cex.axis=1.25, cex.lab=1.25) # , xlim=c(250000,350000))
#abline(v= d[2,dim(d)[2]], lty=2)
#abline(v= d[1,3], lty=2)
if ( ! suppressLegend )
legend(3, length(data2$targetTruthSensitivity)/3 +1, c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25)
mtext("Ti/Tv",2,line=2.25,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
mtext("truth",2,line=0,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
axis(2,line=-1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=data2$targetTruthSensitivity, las=1, cex.axis=1.0)
axis(2,line=1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0)
# plot sensitivity vs. specificity
sensitivity = data2$truthSensitivity
if ( ! is.null(sensitivity) ) {
#specificity = titvFPEstV(targetTITV, novelTiTv)
specificity = novelTiTv
plot(sensitivity, specificity, type="b", col="cornflowerblue", xlab="Tranche truth sensitivity", ylab="Specificity (Novel Ti/Tv ratio)")
abline(h=targetTITV, lty=2)
abline(v=targetSensitivity, lty=2)
#text(max(sensitivity), targetTITV-0.05, labels="Expected novel Ti/Tv", pos=2)
}
dev.off()

View File

@ -0,0 +1,108 @@
#!/bin/env Rscript
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
covariateName = args[2]
outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
c <- read.table(input, header=T)
c <- c[sort.list(c[,1]),]
#
# Plot residual error as a function of the covariate
#
d.good <- c[c$nBases >= 1000,]
d.1000 <- c[c$nBases < 1000,]
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) )
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
if( length(d.good$nBases) == length(c$nBases) ) {
theTitle = paste("RMSE =", round(rmseAll,digits=3))
}
# Don't let residual error go off the edge of the plot
d.good$residualError = d.good$Qempirical-d.good$Qreported
d.good$residualError[which(d.good$residualError > 10)] = 10
d.good$residualError[which(d.good$residualError < -10)] = -10
d.1000$residualError = d.1000$Qempirical-d.1000$Qreported
d.1000$residualError[which(d.1000$residualError > 10)] = 10
d.1000$residualError[which(d.1000$residualError < -10)] = -10
c$residualError = c$Qempirical-c$Qreported
c$residualError[which(c$residualError > 10)] = 10
c$residualError[which(c$residualError < -10)] = -10
pointType = "p"
if( length(c$Covariate) <= 20 ) {
pointType = "o"
}
if( is.numeric(c$Covariate) ) {
plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20)
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10))
points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue")
}
dev.off()
#
# Plot mean quality versus the covariate
#
outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
pointType = "p"
if( length(c$Covariate) <= 20 ) {
pointType = "o"
}
theTitle = paste("Quality By", covariateName);
if( is.numeric(c$Covariate) ) {
plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate)))
points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20)
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40))
points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue")
}
dev.off()
#
# Plot histogram of the covariate
#
e = d.good
f = d.1000
outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0)
lwdSize=2
if( length(c$Covariate) <= 20 ) {
lwdSize=7
} else if( length(c$Covariate) <= 70 ) {
lwdSize=4
}
if( is.numeric(c$Covariate) ) {
if( length(hst$e.Covariate) == 0 ) {
plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
} else {
plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue")
}
axis(2,axTicks(2), format(axTicks(2), scientific=F))
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n")
if( length(hst$c.Covariate) > 9 ) {
axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
} else {
axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate)
}
axis(2,axTicks(2), format(axTicks(2), scientific=F))
}
dev.off()

View File

@ -0,0 +1,70 @@
#!/bin/env Rscript
args <- commandArgs(TRUE)
input = args[1]
Qcutoff = as.numeric(args[2])
maxQ = as.numeric(args[3])
maxHist = as.numeric(args[4])
t=read.table(input, header=T)
#
# Plot of reported quality versus empirical quality
#
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
pdf(outfile, height=7, width=7)
d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,]
d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,]
d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,]
f <- t[t$Qreported < Qcutoff,]
e <- rbind(d.good, d.1000, d.10000)
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) )
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) {
theTitle = paste("RMSE =", round(rmseAll,digits=3));
}
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16)
points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
abline(0,1, lty=2)
dev.off()
#
# Plot Q empirical histogram
#
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0)
percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases))
entropy = -sum(log2(percentBases)*percentBases)
yMax = max(hst$e.nBases)
if(maxHist != 0) {
yMax = maxHist
}
plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n")
points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()
#
# Plot Q reported histogram
#
outfile = paste(input, ".quality_rep_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0)
yMax = max(hst$e.nBases)
if(maxHist != 0) {
yMax = maxHist
}
plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n")
points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()

View File

@ -0,0 +1,138 @@
titvFPEst <- function(titvExpected, titvObserved) { max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), 0.001) }
titvFPEstV <- function(titvExpected, titvs) {
sapply(titvs, function(x) titvFPEst(titvExpected, x))
}
calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) {
TP <- nknown + (1-titvFPEst(knownTiTv, novelTiTv)) * nnovel
2 * TP / 3 / callable
}
marginalTiTv <- function( nx, titvx, ny, titvy ) {
tvx = nx / (titvx + 1)
tix = nx - tvx
tvy = ny / (titvy + 1)
tiy = ny - tvy
tiz = tix - tiy
tvz = tvx - tvy
return(tiz / tvz)
}
marginaldbSNPRate <- function( nx, dbx, ny, dby ) {
knownx = nx * dbx / 100
novelx = nx - knownx
knowny = ny * dby / 100
novely = ny - knowny
knownz = knownx - knowny
novelz = novelx - novely
return(knownz / ( knownz + novelz ) * 100)
}
numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) {
nCalls <- L * theta * calledFractionOfRegion * sum(1 / seq(1, 2 * nIndividuals))
return(list(nCalls = nCalls, nKnown = dbSNPRate * nCalls, nNovel = (1-dbSNPRate) * nCalls))
}
normalize <- function(x) {
x / sum(x)
}
normcumsum <- function(x) {
cumsum(normalize(x))
}
cumhist <- function(d, ...) {
plot(d[order(d)], type="b", col="orange", lwd=2, ...)
}
revcumsum <- function(x) {
return(rev(cumsum(rev(x))))
}
phred <- function(x) {
log10(max(x,10^(-9.9)))*-10
}
pOfB <- function(b, B, Q) {
#print(paste(b, B, Q))
p = 1 - 10^(-Q/10)
if ( b == B )
return(p)
else
return(1 - p)
}
pOfG <- function(bs, qs, G) {
a1 = G[1]
a2 = G[2]
log10p = 0
for ( i in 1:length(bs) ) {
b = bs[i]
q = qs[i]
p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2
log10p = log10p + log10(p1)
}
return(log10p)
}
pOfGs <- function(nAs, nBs, Q) {
bs = c(rep("a", nAs), rep("t", nBs))
qs = rep(Q, nAs + nBs)
G1 = c("a", "a")
G2 = c("a", "t")
G3 = c("t", "t")
log10p1 = pOfG(bs, qs, G1)
log10p2 = pOfG(bs, qs, G2)
log10p3 = pOfG(bs, qs, G3)
Qsample = phred(1 - 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3))))
return(list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample))
}
QsampleExpected <- function(depth, Q) {
weightedAvg = 0
for ( d in 1:(depth*3) ) {
Qsample = 0
pOfD = dpois(d, depth)
for ( nBs in 0:d ) {
pOfnB = dbinom(nBs, d, 0.5)
nAs = d - nBs
Qsample = pOfGs(nAs, nBs, Q)$Qsample
#Qsample = 1
weightedAvg = weightedAvg + Qsample * pOfD * pOfnB
print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg)))
}
}
return(weightedAvg)
}
plotQsamples <- function(depths, Qs, Qmax) {
cols = rainbow(length(Qs))
plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling")
for ( i in 1:length(Qs) ) {
Q = Qs[i]
y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q)))
points(depths, y, col=cols[i], type="b")
}
legend("topleft", paste("Q", Qs), fill=cols)
}
pCallHetGivenDepth <- function(depth, nallelesToCall) {
depths = 0:(2*depth)
pNoAllelesToCall = apply(as.matrix(depths),1,function(d) sum(dbinom(0:nallelesToCall,d,0.5)))
dpois(depths,depth)*(1-pNoAllelesToCall)
}
pCallHets <- function(depth, nallelesToCall) {
sum(pCallHetGivenDepth(depth,nallelesToCall))
}
pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) {
1-(1-pCallHets(depth,nallelesToCall))^nsamples
}