Moving some of the released R scripts into public from private
This commit is contained in:
parent
bb6b1d615b
commit
f4ae6edb92
|
|
@ -0,0 +1,190 @@
|
|||
#!/bin/env Rscript
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
verbose = TRUE
|
||||
|
||||
input = args[1]
|
||||
annotationName = args[2]
|
||||
minBinCutoff = as.numeric(args[3])
|
||||
medianNumVariants = args[4]
|
||||
|
||||
c <- read.table(input, header=T)
|
||||
|
||||
all = c[c$numVariants>minBinCutoff & c$category=="all",]
|
||||
novel = c[c$numVariants>minBinCutoff & c$category=="novel",]
|
||||
dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",]
|
||||
truth = c[c$numVariants>minBinCutoff & c$category=="truth",]
|
||||
|
||||
#
|
||||
# Calculate min, max, medians
|
||||
#
|
||||
|
||||
d = c[c$numVariants>minBinCutoff,]
|
||||
ymin = min(d$titv)
|
||||
ymax = max(d$titv)
|
||||
xmin = min(d$value)
|
||||
xmax = max(d$value)
|
||||
m = weighted.mean(all$value,all$numVariants/sum(all$numVariants))
|
||||
ma = all[all$value > m,]
|
||||
mb = all[all$value < m,]
|
||||
m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants))
|
||||
m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants))
|
||||
if(medianNumVariants == "true") {
|
||||
vc = cumsum( all$numVariants/sum(all$numVariants) )
|
||||
m10 = all$value[ max(which(vc<=0.10)) ]
|
||||
m25 = all$value[ max(which(vc<=0.25)) ]
|
||||
m = all$value[ max(which(vc<=0.5)) ]
|
||||
m75 = all$value[ min(which(vc>=0.75)) ]
|
||||
m90 = all$value[ min(which(vc>=0.90)) ]
|
||||
}
|
||||
|
||||
#
|
||||
# Plot TiTv ratio as a function of the annotation
|
||||
#
|
||||
|
||||
outfile = paste(input, ".TiTv.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
|
||||
axis(1,axTicks(1), format(axTicks(1), scientific=F))
|
||||
abline(v=m,lty=2,col="red")
|
||||
abline(v=m75,lty=3)
|
||||
abline(v=m25,lty=3)
|
||||
text(m, ymin, "50", col="red", cex=0.6);
|
||||
text(m75, ymin, "75", col="black", cex=0.6);
|
||||
text(m25, ymin, "25", col="black", cex=0.6);
|
||||
if(medianNumVariants == "true") {
|
||||
abline(v=m90,lty=3)
|
||||
abline(v=m10,lty=3)
|
||||
text(m10, ymin, "10", col="black", cex=0.6);
|
||||
text(m90, ymin, "90", col="black", cex=0.6);
|
||||
}
|
||||
points(novel$value,novel$titv,col="green",pch=20)
|
||||
points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
|
||||
if( sum(all$truePositive==0) != length(all$truePositive) ) {
|
||||
points(truth$value,truth$titv,col="magenta",pch=20)
|
||||
legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
|
||||
} else {
|
||||
legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
|
||||
}
|
||||
dev.off()
|
||||
|
||||
#
|
||||
# Plot TiTv ratio as a function of the annotation, log scale on the x-axis
|
||||
#
|
||||
|
||||
outfile = paste(input, ".TiTv_log.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
plot(all$value,all$titv,xlab=annotationName,log="x",ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
|
||||
axis(1,axTicks(1), format(axTicks(1), scientific=F))
|
||||
abline(v=m,lty=2,col="red")
|
||||
abline(v=m75,lty=3)
|
||||
abline(v=m25,lty=3)
|
||||
text(m, ymin, "50", col="red", cex=0.6);
|
||||
text(m75, ymin, "75", col="black", cex=0.6);
|
||||
text(m25, ymin, "25", col="black", cex=0.6);
|
||||
if(medianNumVariants == "true") {
|
||||
abline(v=m90,lty=3)
|
||||
abline(v=m10,lty=3)
|
||||
text(m10, ymin, "10", col="black", cex=0.6);
|
||||
text(m90, ymin, "90", col="black", cex=0.6);
|
||||
}
|
||||
points(novel$value,novel$titv,col="green",pch=20)
|
||||
points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
|
||||
if( sum(all$truePositive==0) != length(all$truePositive) ) {
|
||||
points(truth$value,truth$titv,col="magenta",pch=20)
|
||||
legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
|
||||
} else {
|
||||
legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
|
||||
}
|
||||
dev.off()
|
||||
|
||||
#
|
||||
# Plot dbsnp and true positive rate as a function of the annotation
|
||||
#
|
||||
|
||||
ymin = min(all$dbsnp)
|
||||
ymax = max(all$dbsnp)
|
||||
outfile = paste(input, ".truthRate.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
yLabel = "DBsnp Rate"
|
||||
if( sum(all$truePositive==0) != length(all$truePositive) ) {
|
||||
t = all[all$truePositive>0,]
|
||||
yLabel = "DBsnp/True Positive Rate"
|
||||
ymin = min(min(all$dbsnp),min(t$truePositive))
|
||||
ymax = max(max(all$dbsnp),max(t$truePositive))
|
||||
}
|
||||
plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
|
||||
axis(1,axTicks(1), format(axTicks(1), scientific=F))
|
||||
abline(v=m,lty=2,col="red")
|
||||
abline(v=m75,lty=3)
|
||||
abline(v=m25,lty=3)
|
||||
text(m, ymin, "50", col="red", cex=0.6);
|
||||
text(m75, ymin, "75", col="black", cex=0.6);
|
||||
text(m25, ymin, "25", col="black", cex=0.6);
|
||||
if(medianNumVariants == "true") {
|
||||
abline(v=m90,lty=3)
|
||||
abline(v=m10,lty=3)
|
||||
text(m10, ymin, "10", col="black", cex=0.6);
|
||||
text(m90, ymin, "90", col="black", cex=0.6);
|
||||
}
|
||||
if( sum(all$truePositive==0) != length(all$truePositive) ) {
|
||||
points(t$value,t$truePositive,col="magenta",pch=20);
|
||||
legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
|
||||
}
|
||||
dev.off()
|
||||
|
||||
#
|
||||
# Plot dbsnp and true positive rate as a function of the annotation, log scale on the x-axis
|
||||
#
|
||||
|
||||
outfile = paste(input, ".truthRate_log.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
yLabel = "DBsnp Rate"
|
||||
if( sum(all$truePositive==0) != length(all$truePositive) ) {
|
||||
yLabel = "DBsnp/Truth Rate"
|
||||
}
|
||||
plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,ylim=c(ymin,ymax),pch=20,xaxt="n",ps=14);
|
||||
axis(1,axTicks(1), format(axTicks(1), scientific=F))
|
||||
abline(v=m,lty=2,col="red")
|
||||
abline(v=m75,lty=3)
|
||||
abline(v=m25,lty=3)
|
||||
text(m, ymin, "50", col="red", cex=0.6);
|
||||
text(m75, ymin, "75", col="black", cex=0.6);
|
||||
text(m25, ymin, "25", col="black", cex=0.6);
|
||||
if(medianNumVariants == "true") {
|
||||
abline(v=m90,lty=3)
|
||||
abline(v=m10,lty=3)
|
||||
text(m10, ymin, "10", col="black", cex=0.6);
|
||||
text(m90, ymin, "90", col="black", cex=0.6);
|
||||
}
|
||||
if( sum(all$truePositive==0) != length(all$truePositive) ) {
|
||||
points(t$value,t$truePositive,col="magenta",pch=20);
|
||||
legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
|
||||
}
|
||||
dev.off()
|
||||
|
||||
#
|
||||
# Plot histogram of the annotation's value
|
||||
#
|
||||
|
||||
outfile = paste(input, ".Histogram.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4);
|
||||
axis(1,axTicks(1), format(axTicks(1), scientific=F))
|
||||
dev.off()
|
||||
|
||||
#
|
||||
# Plot histogram of the annotation's value, log scale on x-axis
|
||||
#
|
||||
|
||||
outfile = paste(input, ".Histogram_log.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
plot(all$value,all$numVariants,xlab=annotationName,log="x",ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4);
|
||||
axis(1,axTicks(1), format(axTicks(1), scientific=F))
|
||||
dev.off()
|
||||
|
|
@ -0,0 +1,87 @@
|
|||
#!/bin/env Rscript
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
verbose = TRUE
|
||||
|
||||
tranchesFile = args[1]
|
||||
targetTITV = as.numeric(args[2])
|
||||
targetSensitivity = as.numeric(args[3])
|
||||
suppressLegend = ! is.na(args[4])
|
||||
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
# Useful general routines
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
|
||||
MIN_FP_RATE = 0.001 # 1 / 1000 is min error rate
|
||||
|
||||
titvFPEst <- function(titvExpected, titvObserved) {
|
||||
max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), MIN_FP_RATE)
|
||||
}
|
||||
|
||||
titvFPEstV <- function(titvExpected, titvs) {
|
||||
sapply(titvs, function(x) titvFPEst(titvExpected, x))
|
||||
}
|
||||
|
||||
nTPFP <- function(nVariants, FDR) {
|
||||
return(list(TP = nVariants * (1 - FDR/100), FP = nVariants * (FDR / 100)))
|
||||
}
|
||||
|
||||
leftShift <- function(x, leftValue = 0) {
|
||||
r = rep(leftValue, length(x))
|
||||
for ( i in 1:(length(x)-1) ) {
|
||||
#print(list(i=i))
|
||||
r[i] = x[i+1]
|
||||
}
|
||||
r
|
||||
}
|
||||
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
# Tranches plot
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
data2 = read.table(tranchesFile,sep=",",head=T)
|
||||
data2 = data2[order(data2$novelTiTv, decreasing=F),]
|
||||
#data2 = data2[order(data2$FDRtranche, decreasing=T),]
|
||||
cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange")
|
||||
density=c(20, -1, -1, 20)
|
||||
outfile = paste(tranchesFile, ".pdf", sep="")
|
||||
pdf(outfile, height=5, width=8)
|
||||
par(mar = c(5, 5, 4, 2) + 0.1)
|
||||
novelTiTv = c(data2$novelTITV,data2$novelTiTv)
|
||||
alpha = 1 - titvFPEstV(targetTITV, novelTiTv)
|
||||
#print(alpha)
|
||||
|
||||
numGood = round(alpha * data2$numNovel);
|
||||
|
||||
#numGood = round(data2$numNovel * (1-data2$targetTruthSensitivity/100))
|
||||
numBad = data2$numNovel - numGood;
|
||||
|
||||
numPrevGood = leftShift(numGood, 0)
|
||||
numNewGood = numGood - numPrevGood
|
||||
numPrevBad = leftShift(numBad, 0)
|
||||
numNewBad = numBad - numPrevBad
|
||||
|
||||
d=matrix(c(numPrevGood,numNewGood, numNewBad, numPrevBad),4,byrow=TRUE)
|
||||
#print(d)
|
||||
barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (1000s)", density=density, cex.axis=1.25, cex.lab=1.25) # , xlim=c(250000,350000))
|
||||
#abline(v= d[2,dim(d)[2]], lty=2)
|
||||
#abline(v= d[1,3], lty=2)
|
||||
if ( ! suppressLegend )
|
||||
legend(3, length(data2$targetTruthSensitivity)/3 +1, c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25)
|
||||
|
||||
mtext("Ti/Tv",2,line=2.25,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
|
||||
mtext("truth",2,line=0,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
|
||||
axis(2,line=-1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=data2$targetTruthSensitivity, las=1, cex.axis=1.0)
|
||||
axis(2,line=1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0)
|
||||
|
||||
# plot sensitivity vs. specificity
|
||||
sensitivity = data2$truthSensitivity
|
||||
if ( ! is.null(sensitivity) ) {
|
||||
#specificity = titvFPEstV(targetTITV, novelTiTv)
|
||||
specificity = novelTiTv
|
||||
plot(sensitivity, specificity, type="b", col="cornflowerblue", xlab="Tranche truth sensitivity", ylab="Specificity (Novel Ti/Tv ratio)")
|
||||
abline(h=targetTITV, lty=2)
|
||||
abline(v=targetSensitivity, lty=2)
|
||||
#text(max(sensitivity), targetTITV-0.05, labels="Expected novel Ti/Tv", pos=2)
|
||||
}
|
||||
|
||||
dev.off()
|
||||
|
|
@ -0,0 +1,108 @@
|
|||
#!/bin/env Rscript
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
verbose = TRUE
|
||||
|
||||
input = args[1]
|
||||
covariateName = args[2]
|
||||
|
||||
outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
c <- read.table(input, header=T)
|
||||
c <- c[sort.list(c[,1]),]
|
||||
|
||||
#
|
||||
# Plot residual error as a function of the covariate
|
||||
#
|
||||
|
||||
d.good <- c[c$nBases >= 1000,]
|
||||
d.1000 <- c[c$nBases < 1000,]
|
||||
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
|
||||
rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) )
|
||||
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
|
||||
if( length(d.good$nBases) == length(c$nBases) ) {
|
||||
theTitle = paste("RMSE =", round(rmseAll,digits=3))
|
||||
}
|
||||
# Don't let residual error go off the edge of the plot
|
||||
d.good$residualError = d.good$Qempirical-d.good$Qreported
|
||||
d.good$residualError[which(d.good$residualError > 10)] = 10
|
||||
d.good$residualError[which(d.good$residualError < -10)] = -10
|
||||
d.1000$residualError = d.1000$Qempirical-d.1000$Qreported
|
||||
d.1000$residualError[which(d.1000$residualError > 10)] = 10
|
||||
d.1000$residualError[which(d.1000$residualError < -10)] = -10
|
||||
c$residualError = c$Qempirical-c$Qreported
|
||||
c$residualError[which(c$residualError > 10)] = 10
|
||||
c$residualError[which(c$residualError < -10)] = -10
|
||||
pointType = "p"
|
||||
if( length(c$Covariate) <= 20 ) {
|
||||
pointType = "o"
|
||||
}
|
||||
if( is.numeric(c$Covariate) ) {
|
||||
plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20)
|
||||
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
|
||||
plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10))
|
||||
points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue")
|
||||
}
|
||||
dev.off()
|
||||
|
||||
|
||||
#
|
||||
# Plot mean quality versus the covariate
|
||||
#
|
||||
|
||||
outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
pointType = "p"
|
||||
if( length(c$Covariate) <= 20 ) {
|
||||
pointType = "o"
|
||||
}
|
||||
theTitle = paste("Quality By", covariateName);
|
||||
if( is.numeric(c$Covariate) ) {
|
||||
plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20)
|
||||
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
|
||||
plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40))
|
||||
points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue")
|
||||
}
|
||||
dev.off()
|
||||
|
||||
#
|
||||
# Plot histogram of the covariate
|
||||
#
|
||||
|
||||
e = d.good
|
||||
f = d.1000
|
||||
outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0)
|
||||
hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0)
|
||||
|
||||
lwdSize=2
|
||||
if( length(c$Covariate) <= 20 ) {
|
||||
lwdSize=7
|
||||
} else if( length(c$Covariate) <= 70 ) {
|
||||
lwdSize=4
|
||||
}
|
||||
|
||||
if( is.numeric(c$Covariate) ) {
|
||||
if( length(hst$e.Covariate) == 0 ) {
|
||||
plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
} else {
|
||||
plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue")
|
||||
}
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
|
||||
hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
|
||||
plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n")
|
||||
if( length(hst$c.Covariate) > 9 ) {
|
||||
axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
|
||||
} else {
|
||||
axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate)
|
||||
}
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
}
|
||||
dev.off()
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
#!/bin/env Rscript
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
|
||||
input = args[1]
|
||||
Qcutoff = as.numeric(args[2])
|
||||
maxQ = as.numeric(args[3])
|
||||
maxHist = as.numeric(args[4])
|
||||
|
||||
t=read.table(input, header=T)
|
||||
|
||||
#
|
||||
# Plot of reported quality versus empirical quality
|
||||
#
|
||||
|
||||
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,]
|
||||
d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,]
|
||||
d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,]
|
||||
f <- t[t$Qreported < Qcutoff,]
|
||||
e <- rbind(d.good, d.1000, d.10000)
|
||||
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
|
||||
rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) )
|
||||
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
|
||||
if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) {
|
||||
theTitle = paste("RMSE =", round(rmseAll,digits=3));
|
||||
}
|
||||
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
|
||||
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
|
||||
points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16)
|
||||
points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
|
||||
abline(0,1, lty=2)
|
||||
dev.off()
|
||||
|
||||
#
|
||||
# Plot Q empirical histogram
|
||||
#
|
||||
|
||||
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
|
||||
hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0)
|
||||
percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases))
|
||||
entropy = -sum(log2(percentBases)*percentBases)
|
||||
yMax = max(hst$e.nBases)
|
||||
if(maxHist != 0) {
|
||||
yMax = maxHist
|
||||
}
|
||||
plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n")
|
||||
points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1")
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
dev.off()
|
||||
|
||||
#
|
||||
# Plot Q reported histogram
|
||||
#
|
||||
|
||||
outfile = paste(input, ".quality_rep_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
|
||||
hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0)
|
||||
yMax = max(hst$e.nBases)
|
||||
if(maxHist != 0) {
|
||||
yMax = maxHist
|
||||
}
|
||||
plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n")
|
||||
points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1")
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
dev.off()
|
||||
|
|
@ -0,0 +1,138 @@
|
|||
titvFPEst <- function(titvExpected, titvObserved) { max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), 0.001) }
|
||||
|
||||
titvFPEstV <- function(titvExpected, titvs) {
|
||||
sapply(titvs, function(x) titvFPEst(titvExpected, x))
|
||||
}
|
||||
|
||||
calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) {
|
||||
TP <- nknown + (1-titvFPEst(knownTiTv, novelTiTv)) * nnovel
|
||||
2 * TP / 3 / callable
|
||||
}
|
||||
|
||||
marginalTiTv <- function( nx, titvx, ny, titvy ) {
|
||||
tvx = nx / (titvx + 1)
|
||||
tix = nx - tvx
|
||||
tvy = ny / (titvy + 1)
|
||||
tiy = ny - tvy
|
||||
tiz = tix - tiy
|
||||
tvz = tvx - tvy
|
||||
return(tiz / tvz)
|
||||
}
|
||||
marginaldbSNPRate <- function( nx, dbx, ny, dby ) {
|
||||
knownx = nx * dbx / 100
|
||||
novelx = nx - knownx
|
||||
knowny = ny * dby / 100
|
||||
novely = ny - knowny
|
||||
knownz = knownx - knowny
|
||||
novelz = novelx - novely
|
||||
return(knownz / ( knownz + novelz ) * 100)
|
||||
}
|
||||
|
||||
numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) {
|
||||
nCalls <- L * theta * calledFractionOfRegion * sum(1 / seq(1, 2 * nIndividuals))
|
||||
return(list(nCalls = nCalls, nKnown = dbSNPRate * nCalls, nNovel = (1-dbSNPRate) * nCalls))
|
||||
}
|
||||
|
||||
normalize <- function(x) {
|
||||
x / sum(x)
|
||||
}
|
||||
|
||||
normcumsum <- function(x) {
|
||||
cumsum(normalize(x))
|
||||
}
|
||||
|
||||
cumhist <- function(d, ...) {
|
||||
plot(d[order(d)], type="b", col="orange", lwd=2, ...)
|
||||
}
|
||||
|
||||
revcumsum <- function(x) {
|
||||
return(rev(cumsum(rev(x))))
|
||||
}
|
||||
|
||||
phred <- function(x) {
|
||||
log10(max(x,10^(-9.9)))*-10
|
||||
}
|
||||
|
||||
pOfB <- function(b, B, Q) {
|
||||
#print(paste(b, B, Q))
|
||||
p = 1 - 10^(-Q/10)
|
||||
if ( b == B )
|
||||
return(p)
|
||||
else
|
||||
return(1 - p)
|
||||
}
|
||||
|
||||
pOfG <- function(bs, qs, G) {
|
||||
a1 = G[1]
|
||||
a2 = G[2]
|
||||
|
||||
log10p = 0
|
||||
for ( i in 1:length(bs) ) {
|
||||
b = bs[i]
|
||||
q = qs[i]
|
||||
p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2
|
||||
log10p = log10p + log10(p1)
|
||||
}
|
||||
|
||||
return(log10p)
|
||||
}
|
||||
|
||||
pOfGs <- function(nAs, nBs, Q) {
|
||||
bs = c(rep("a", nAs), rep("t", nBs))
|
||||
qs = rep(Q, nAs + nBs)
|
||||
G1 = c("a", "a")
|
||||
G2 = c("a", "t")
|
||||
G3 = c("t", "t")
|
||||
|
||||
log10p1 = pOfG(bs, qs, G1)
|
||||
log10p2 = pOfG(bs, qs, G2)
|
||||
log10p3 = pOfG(bs, qs, G3)
|
||||
Qsample = phred(1 - 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3))))
|
||||
|
||||
return(list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample))
|
||||
}
|
||||
|
||||
QsampleExpected <- function(depth, Q) {
|
||||
weightedAvg = 0
|
||||
for ( d in 1:(depth*3) ) {
|
||||
Qsample = 0
|
||||
pOfD = dpois(d, depth)
|
||||
for ( nBs in 0:d ) {
|
||||
pOfnB = dbinom(nBs, d, 0.5)
|
||||
nAs = d - nBs
|
||||
Qsample = pOfGs(nAs, nBs, Q)$Qsample
|
||||
#Qsample = 1
|
||||
weightedAvg = weightedAvg + Qsample * pOfD * pOfnB
|
||||
print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg)))
|
||||
}
|
||||
}
|
||||
|
||||
return(weightedAvg)
|
||||
}
|
||||
|
||||
plotQsamples <- function(depths, Qs, Qmax) {
|
||||
cols = rainbow(length(Qs))
|
||||
plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling")
|
||||
|
||||
for ( i in 1:length(Qs) ) {
|
||||
Q = Qs[i]
|
||||
y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q)))
|
||||
points(depths, y, col=cols[i], type="b")
|
||||
}
|
||||
|
||||
legend("topleft", paste("Q", Qs), fill=cols)
|
||||
}
|
||||
|
||||
pCallHetGivenDepth <- function(depth, nallelesToCall) {
|
||||
depths = 0:(2*depth)
|
||||
pNoAllelesToCall = apply(as.matrix(depths),1,function(d) sum(dbinom(0:nallelesToCall,d,0.5)))
|
||||
dpois(depths,depth)*(1-pNoAllelesToCall)
|
||||
}
|
||||
|
||||
pCallHets <- function(depth, nallelesToCall) {
|
||||
sum(pCallHetGivenDepth(depth,nallelesToCall))
|
||||
}
|
||||
|
||||
pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) {
|
||||
1-(1-pCallHets(depth,nallelesToCall))^nsamples
|
||||
}
|
||||
Loading…
Reference in New Issue