Merge branch 'master' of ssh://gsa1/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Matt Hanna 2011-06-30 18:15:53 -04:00
commit 91f8ab115d
33 changed files with 1121 additions and 860 deletions

View File

@ -0,0 +1,190 @@
#!/bin/env Rscript
# Plots Ti/Tv ratio, dbSNP/truth-positive rates, and bin counts as a function
# of one variant annotation, from a binned summary table produced upstream.
# Usage: script <table> <annotationName> <minBinCutoff> <medianNumVariants: "true"|other>
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]                    # path to the binned annotation table
annotationName = args[2]           # annotation label used on the x-axes
minBinCutoff = as.numeric(args[3]) # drop bins with <= this many variants
medianNumVariants = args[4]        # "true": derive percentiles from cumulative variant counts
# NOTE(review): `c` masks the builtin c() here; R's function-call lookup skips
# non-function bindings so calls like c(ymin,ymax) below still work, but
# renaming this data frame would be safer.
c <- read.table(input, header=T)
# Per-category subsets restricted to sufficiently populated bins.
all = c[c$numVariants>minBinCutoff & c$category=="all",]
novel = c[c$numVariants>minBinCutoff & c$category=="novel",]
dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",]
truth = c[c$numVariants>minBinCutoff & c$category=="truth",]
#
# Calculate min, max, medians
#
d = c[c$numVariants>minBinCutoff,]
# Axis limits shared by every category so the plots are comparable.
ymin = min(d$titv)
ymax = max(d$titv)
xmin = min(d$value)
xmax = max(d$value)
# Default "percentile" markers: weighted mean overall (m) and weighted means
# of the bins above/below it (m75/m25), weights = per-bin variant fraction.
m = weighted.mean(all$value,all$numVariants/sum(all$numVariants))
ma = all[all$value > m,]
mb = all[all$value < m,]
m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants))
m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants))
# Alternatively take true 10/25/50/75/90 percentiles of the cumulative
# variant-count distribution (assumes bins are ordered by value -- TODO confirm).
if(medianNumVariants == "true") {
vc = cumsum( all$numVariants/sum(all$numVariants) )
m10 = all$value[ max(which(vc<=0.10)) ]
m25 = all$value[ max(which(vc<=0.25)) ]
m = all$value[ max(which(vc<=0.5)) ]
m75 = all$value[ min(which(vc>=0.75)) ]
m90 = all$value[ min(which(vc>=0.90)) ]
}
#
# Plot TiTv ratio as a function of the annotation
#
outfile = paste(input, ".TiTv.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
# All-variants Ti/Tv per bin; custom x-axis below avoids scientific notation.
plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
# Vertical percentile markers computed in the setup section.
abline(v=m,lty=2,col="red")
abline(v=m75,lty=3)
abline(v=m25,lty=3)
text(m, ymin, "50", col="red", cex=0.6);
text(m75, ymin, "75", col="black", cex=0.6);
text(m25, ymin, "25", col="black", cex=0.6);
# 10th/90th markers only exist in the cumulative-percentile mode.
if(medianNumVariants == "true") {
abline(v=m90,lty=3)
abline(v=m10,lty=3)
text(m10, ymin, "10", col="black", cex=0.6);
text(m90, ymin, "90", col="black", cex=0.6);
}
# Overlay the novel and dbSNP subsets on the same axes.
points(novel$value,novel$titv,col="green",pch=20)
points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
# Truth points/legend only when any bin has truePositive data (not all zero).
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(truth$value,truth$titv,col="magenta",pch=20)
legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
} else {
legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
}
dev.off()
#
# Plot TiTv ratio as a function of the annotation, log scale on the x-axis
#
# Identical to the previous section except for log="x"; assumes all values
# are positive or plot() will drop/complain about nonpositive x -- TODO confirm.
outfile = paste(input, ".TiTv_log.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$titv,xlab=annotationName,log="x",ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
abline(v=m,lty=2,col="red")
abline(v=m75,lty=3)
abline(v=m25,lty=3)
text(m, ymin, "50", col="red", cex=0.6);
text(m75, ymin, "75", col="black", cex=0.6);
text(m25, ymin, "25", col="black", cex=0.6);
if(medianNumVariants == "true") {
abline(v=m90,lty=3)
abline(v=m10,lty=3)
text(m10, ymin, "10", col="black", cex=0.6);
text(m90, ymin, "90", col="black", cex=0.6);
}
points(novel$value,novel$titv,col="green",pch=20)
points(dbsnp$value,dbsnp$titv,col="blue",pch=20)
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(truth$value,truth$titv,col="magenta",pch=20)
legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20))
} else {
legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20))
}
dev.off()
#
# Plot dbsnp and true positive rate as a function of the annotation
#
# y-limits are recomputed from the dbSNP rate (and widened to cover the
# true-positive rate when truth data exists).
ymin = min(all$dbsnp)
ymax = max(all$dbsnp)
outfile = paste(input, ".truthRate.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
yLabel = "DBsnp Rate"
if( sum(all$truePositive==0) != length(all$truePositive) ) {
# NOTE(review): `t` masks the builtin t(); it is only defined when truth
# data exists and is only used under the same guard below.
t = all[all$truePositive>0,]
yLabel = "DBsnp/True Positive Rate"
ymin = min(min(all$dbsnp),min(t$truePositive))
ymax = max(max(all$dbsnp),max(t$truePositive))
}
plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
# Same percentile markers as the Ti/Tv plots.
abline(v=m,lty=2,col="red")
abline(v=m75,lty=3)
abline(v=m25,lty=3)
text(m, ymin, "50", col="red", cex=0.6);
text(m75, ymin, "75", col="black", cex=0.6);
text(m25, ymin, "25", col="black", cex=0.6);
if(medianNumVariants == "true") {
abline(v=m90,lty=3)
abline(v=m10,lty=3)
text(m10, ymin, "10", col="black", cex=0.6);
text(m90, ymin, "90", col="black", cex=0.6);
}
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(t$value,t$truePositive,col="magenta",pch=20);
legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
}
dev.off()
#
# Plot dbsnp and true positive rate as a function of the annotation, log scale on the x-axis
#
# Reuses ymin/ymax (and t, when truth data exists) from the previous section.
outfile = paste(input, ".truthRate_log.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
yLabel = "DBsnp Rate"
if( sum(all$truePositive==0) != length(all$truePositive) ) {
yLabel = "DBsnp/Truth Rate"
}
plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,ylim=c(ymin,ymax),pch=20,xaxt="n",ps=14);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
abline(v=m,lty=2,col="red")
abline(v=m75,lty=3)
abline(v=m25,lty=3)
text(m, ymin, "50", col="red", cex=0.6);
text(m75, ymin, "75", col="black", cex=0.6);
text(m25, ymin, "25", col="black", cex=0.6);
if(medianNumVariants == "true") {
abline(v=m90,lty=3)
abline(v=m10,lty=3)
text(m10, ymin, "10", col="black", cex=0.6);
text(m90, ymin, "90", col="black", cex=0.6);
}
if( sum(all$truePositive==0) != length(all$truePositive) ) {
points(t$value,t$truePositive,col="magenta",pch=20);
legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20))
}
dev.off()
#
# Plot histogram of the annotation's value
#
# type="h" draws vertical bars at each bin value (pre-binned histogram).
outfile = paste(input, ".Histogram.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
dev.off()
#
# Plot histogram of the annotation's value, log scale on x-axis
#
outfile = paste(input, ".Histogram_log.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
plot(all$value,all$numVariants,xlab=annotationName,log="x",ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4);
axis(1,axTicks(1), format(axTicks(1), scientific=F))
dev.off()

View File

@ -0,0 +1,87 @@
#!/bin/env Rscript
# Plots recalibration tranches: stacked TP/FP bars of novel variants per
# tranche plus a sensitivity-vs-specificity curve, into <tranchesFile>.pdf.
# Usage: script <tranchesFile> <targetTITV> <targetSensitivity> [anything-to-suppress-legend]
args <- commandArgs(TRUE)
verbose = TRUE
tranchesFile = args[1]                  # comma-separated tranches table
targetTITV = as.numeric(args[2])        # expected novel Ti/Tv of an all-TP call set
targetSensitivity = as.numeric(args[3]) # truth-sensitivity target (vertical marker)
suppressLegend = ! is.na(args[4])       # presence of a 4th argument hides the legend
# -----------------------------------------------------------------------------------------------
# Useful general routines
# -----------------------------------------------------------------------------------------------
MIN_FP_RATE = 0.001 # 1 / 1000 is min error rate
# Estimate the false-positive rate of a call set from its observed Ti/Tv,
# given the Ti/Tv expected for true variants.  0.5 is the Ti/Tv of purely
# random (all-FP) calls; the estimate is clamped to [MIN_FP_RATE, 1].
titvFPEst <- function(titvExpected, titvObserved) {
  rawEstimate = 1 - (titvObserved - 0.5) / (titvExpected - 0.5)
  min(max(rawEstimate, MIN_FP_RATE), 1)
}
# Vectorized wrapper for titvFPEst over a vector of observed Ti/Tv values.
titvFPEstV <- function(titvExpected, titvs) {
  vapply(titvs, function(titvObserved) titvFPEst(titvExpected, titvObserved), numeric(1))
}
# Split a variant count into expected true and false positives for a given
# false-discovery rate (FDR expressed as a percentage).
nTPFP <- function(nVariants, FDR) {
  fdrFraction = FDR / 100
  list(TP = nVariants * (1 - fdrFraction), FP = nVariants * fdrFraction)
}
# Shift a vector one position to the left, padding the tail with leftValue.
# leftShift(c(1,2,3)) == c(2,3,0).
# Uses seq_len() rather than 1:(length(x)-1): for a length-1 input the old
# form iterated over c(1, 0), indexing past the end and returning NA; seq_len
# yields an empty loop so the single element is correctly replaced by leftValue.
leftShift <- function(x, leftValue = 0) {
  r = rep(leftValue, length(x))
  for ( i in seq_len(length(x) - 1) ) {
    r[i] = x[i+1]
  }
  r
}
# -----------------------------------------------------------------------------------------------
# Tranches plot
# -----------------------------------------------------------------------------------------------
data2 = read.table(tranchesFile,sep=",",head=T)
# Order tranches by novel Ti/Tv so bars stack from least to most stringent.
data2 = data2[order(data2$novelTiTv, decreasing=F),]
#data2 = data2[order(data2$FDRtranche, decreasing=T),]
# Bar colors/hatching: blue = true positives, orange = false positives;
# density 20 = hatched (cumulative), -1 = solid (tranche-specific).
cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange")
density=c(20, -1, -1, 20)
outfile = paste(tranchesFile, ".pdf", sep="")
pdf(outfile, height=5, width=8)
par(mar = c(5, 5, 4, 2) + 0.1)
# NOTE(review): concatenates both column spellings (novelTITV / novelTiTv);
# whichever column is absent is NULL and contributes nothing -- confirm the
# input schema only ever has one of the two.
novelTiTv = c(data2$novelTITV,data2$novelTiTv)
# alpha = estimated TP fraction of novel calls in each tranche.
alpha = 1 - titvFPEstV(targetTITV, novelTiTv)
#print(alpha)
numGood = round(alpha * data2$numNovel);
#numGood = round(data2$numNovel * (1-data2$targetTruthSensitivity/100))
numBad = data2$numNovel - numGood;
# Decompose cumulative counts into "previous tranche" + "new in this tranche".
numPrevGood = leftShift(numGood, 0)
numNewGood = numGood - numPrevGood
numPrevBad = leftShift(numBad, 0)
numNewBad = numBad - numPrevBad
d=matrix(c(numPrevGood,numNewGood, numNewBad, numPrevBad),4,byrow=TRUE)
#print(d)
# Horizontal stacked bars, counts in thousands.
barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (1000s)", density=density, cex.axis=1.25, cex.lab=1.25) # , xlim=c(250000,350000))
#abline(v= d[2,dim(d)[2]], lty=2)
#abline(v= d[1,3], lty=2)
if ( ! suppressLegend )
legend(3, length(data2$targetTruthSensitivity)/3 +1, c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25)
# Left-margin annotations: Ti/Tv and truth-sensitivity labels per bar
# (0.7 + i*1.2 matches barplot's default bar spacing with space=0.2).
mtext("Ti/Tv",2,line=2.25,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
mtext("truth",2,line=0,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
axis(2,line=-1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=data2$targetTruthSensitivity, las=1, cex.axis=1.0)
axis(2,line=1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0)
# plot sensitivity vs. specificity
# Second page: tranche truth sensitivity vs. novel Ti/Tv, with target lines.
sensitivity = data2$truthSensitivity
if ( ! is.null(sensitivity) ) {
#specificity = titvFPEstV(targetTITV, novelTiTv)
specificity = novelTiTv
plot(sensitivity, specificity, type="b", col="cornflowerblue", xlab="Tranche truth sensitivity", ylab="Specificity (Novel Ti/Tv ratio)")
abline(h=targetTITV, lty=2)
abline(v=targetSensitivity, lty=2)
#text(max(sensitivity), targetTITV-0.05, labels="Expected novel Ti/Tv", pos=2)
}
dev.off()

View File

@ -0,0 +1,108 @@
#!/bin/env Rscript
# Plots base-quality recalibration diagnostics for one covariate:
# residual error (empirical - reported quality), mean reported quality,
# and a histogram of observation counts, each versus the covariate value.
# Usage: script <table> <covariateName>
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
covariateName = args[2]
outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
# NOTE(review): `c` masks the builtin c(); calls still resolve to base::c.
c <- read.table(input, header=T)
# Sort rows by the first column (the covariate value).
c <- c[sort.list(c[,1]),]
#
# Plot residual error as a function of the covariate
#
# Split bins by evidence: >= 1000 observed bases (plotted dark) vs. fewer
# (plotted lighter, less trustworthy).
d.good <- c[c$nBases >= 1000,]
d.1000 <- c[c$nBases < 1000,]
# Base-count-weighted RMSE of the quality residuals.
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) )
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
# If every bin is well covered the two RMSEs coincide; report just one.
if( length(d.good$nBases) == length(c$nBases) ) {
theTitle = paste("RMSE =", round(rmseAll,digits=3))
}
# Don't let residual error go off the edge of the plot
# (residuals are clamped into [-10, 10]).
d.good$residualError = d.good$Qempirical-d.good$Qreported
d.good$residualError[which(d.good$residualError > 10)] = 10
d.good$residualError[which(d.good$residualError < -10)] = -10
d.1000$residualError = d.1000$Qempirical-d.1000$Qreported
d.1000$residualError[which(d.1000$residualError > 10)] = 10
d.1000$residualError[which(d.1000$residualError < -10)] = -10
c$residualError = c$Qempirical-c$Qreported
c$residualError[which(c$residualError > 10)] = 10
c$residualError[which(c$residualError < -10)] = -10
# Connect points with lines ("o") only when there are few covariate values.
pointType = "p"
if( length(c$Covariate) <= 20 ) {
pointType = "o"
}
if( is.numeric(c$Covariate) ) {
plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20)
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10))
points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue")
}
dev.off()
#
# Plot mean quality versus the covariate
#
# Reuses c, d.good and d.1000 computed in the residual-error section above.
outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
# Connect points with lines ("o") only when there are few covariate values.
pointType = "p"
if( length(c$Covariate) <= 20 ) {
pointType = "o"
}
theTitle = paste("Quality By", covariateName);
if( is.numeric(c$Covariate) ) {
plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate)))
points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20)
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40))
points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue")
}
dev.off()
#
# Plot histogram of the covariate
#
e = d.good
f = d.1000
outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
# Drop empty bins; data.frame() renames columns to e.Covariate / e.nBases etc.
hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0)
# Wider bars when there are few covariate values.
lwdSize=2
if( length(c$Covariate) <= 20 ) {
lwdSize=7
} else if( length(c$Covariate) <= 70 ) {
lwdSize=4
}
if( is.numeric(c$Covariate) ) {
# If no well-covered bins exist, plot only the low-coverage ones.
if( length(hst$e.Covariate) == 0 ) {
plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
} else {
plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue")
}
# Plain (non-scientific) tick labels for base counts.
axis(2,axTicks(2), format(axTicks(2), scientific=F))
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
# Non-numeric covariates: bars at integer positions with value labels,
# every other label when there are many categories.
hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n")
if( length(hst$c.Covariate) > 9 ) {
axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
} else {
axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate)
}
axis(2,axTicks(2), format(axTicks(2), scientific=F))
}
dev.off()

View File

@ -0,0 +1,70 @@
#!/bin/env Rscript
# Plots base-quality recalibration summary: empirical vs. reported quality
# scatter plus histograms of both, stratified by per-bin evidence (nBases).
# Usage: script <table> <Qcutoff> <maxQ> <maxHist>
args <- commandArgs(TRUE)
input = args[1]
Qcutoff = as.numeric(args[2])  # bins reported below this Q are highlighted separately
maxQ = as.numeric(args[3])     # axis limit for quality scores
maxHist = as.numeric(args[4])  # fixed histogram y-max; 0 means auto-scale
# NOTE(review): `t` masks the builtin t(); function calls still resolve.
t=read.table(input, header=T)
#
# Plot of reported quality versus empirical quality
#
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
pdf(outfile, height=7, width=7)
# Above-cutoff bins stratified by evidence; f holds the below-cutoff bins.
d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,]
d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,]
d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,]
f <- t[t$Qreported < Qcutoff,]
e <- rbind(d.good, d.1000, d.10000)
# Base-count-weighted RMSE over the well-covered bins and over all above-cutoff bins.
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) )
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
# When all above-cutoff bins are well covered the two RMSEs coincide.
if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) {
theTitle = paste("RMSE =", round(rmseAll,digits=3));
}
# Darker blue = more evidence; maroon = below the quality cutoff.
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16)
points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
abline(0,1, lty=2) # y = x: perfectly calibrated qualities fall on this line
dev.off()
#
# Plot Q empirical histogram
#
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
# Drop empty bins; data.frame() renames columns to e.Qempirical / e.nBases etc.
hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0)
# Shannon entropy (bits) of the above-cutoff base-count distribution.
percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases))
entropy = -sum(log2(percentBases)*percentBases)
yMax = max(hst$e.nBases)
if(maxHist != 0) {
yMax = maxHist
}
plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n")
points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1")
# Plain (non-scientific) tick labels for base counts.
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()
#
# Plot Q reported histogram
#
outfile = paste(input, ".quality_rep_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0)
yMax = max(hst$e.nBases)
if(maxHist != 0) {
yMax = maxHist
}
# NOTE(review): the title reuses `entropy` computed from the EMPIRICAL
# histogram above; it is not recomputed for the reported distribution --
# confirm this is intentional.
plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n")
points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()

View File

@ -0,0 +1,138 @@
# Estimate the false-positive rate of a call set from its observed Ti/Tv,
# given the Ti/Tv expected for true variants (0.5 is the Ti/Tv of random,
# all-FP calls).  The result is clamped into [0.001, 1].
titvFPEst <- function(titvExpected, titvObserved) {
  estimate = 1 - (titvObserved - 0.5) / (titvExpected - 0.5)
  min(max(estimate, 0.001), 1)
}
# Vectorized wrapper for titvFPEst over a vector of observed Ti/Tv values.
titvFPEstV <- function(titvExpected, titvs) {
  vapply(titvs, function(titvObserved) titvFPEst(titvExpected, titvObserved), numeric(1))
}
# Heterozygosity estimate: true positives = known variants plus the estimated
# TP fraction of novels (via titvFPEst), scaled by 2/3 over callable bases.
calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) {
  fpRate <- titvFPEst(knownTiTv, novelTiTv)
  truePositives <- nknown + (1 - fpRate) * nnovel
  (2 * truePositives) / (3 * callable)
}
# Ti/Tv of the marginal set x - y: decompose each set's count into
# transitions and transversions using its Ti/Tv ratio, subtract component-wise,
# and re-form the ratio.
marginalTiTv <- function( nx, titvx, ny, titvy ) {
  transversionsX = nx / (titvx + 1)
  transitionsX   = nx - transversionsX
  transversionsY = ny / (titvy + 1)
  transitionsY   = ny - transversionsY
  (transitionsX - transitionsY) / (transversionsX - transversionsY)
}
# dbSNP rate (percent) of the marginal set x - y, where dbx/dby are each
# set's dbSNP percentage: subtract known and novel counts separately, then
# recompute the percentage on the difference.
marginaldbSNPRate <- function( nx, dbx, ny, dby ) {
  knownX = nx * dbx / 100
  knownY = ny * dby / 100
  novelX = nx - knownX
  novelY = ny - knownY
  marginalKnown = knownX - knownY
  marginalNovel = novelX - novelY
  100 * marginalKnown / (marginalKnown + marginalNovel)
}
# Expected number of variant calls over a region of length L at diversity
# theta: harmonic sum over the 2N sampled chromosomes times theta * L,
# restricted to the called fraction, then split by the dbSNP (known) rate.
numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) {
  harmonicFactor <- sum(1 / seq(1, 2 * nIndividuals))
  nCalls <- L * theta * calledFractionOfRegion * harmonicFactor
  list(nCalls = nCalls, nKnown = dbSNPRate * nCalls, nNovel = (1 - dbSNPRate) * nCalls)
}
# Scale a numeric vector so its elements sum to 1.
normalize <- function(x) x / sum(x)
# Empirical CDF of x: running sum of the normalized (sum-to-1) vector.
# The normalization is inlined (x / sum(x)) rather than calling normalize().
normcumsum <- function(x) {
  cumsum(x / sum(x))
}
# Plot the values of d in ascending order as an orange point-and-line curve;
# extra graphics arguments are forwarded to plot().
cumhist <- function(d, ...) {
  ascending = d[order(d)]
  plot(ascending, type="b", col="orange", lwd=2, ...)
}
# Cumulative sum taken from the right-hand end of the vector:
# revcumsum(c(1,2,3)) == c(6,5,3).
revcumsum <- function(x) {
  reversed = rev(x)
  rev(cumsum(reversed))
}
# Convert an error probability to a phred-scaled quality (-10 * log10(p)).
# Probabilities are floored at 10^-9.9 so phred(0) caps at Q99 instead of Inf.
# pmax (element-wise) replaces max so the function also works on vector
# input; scalar behavior is unchanged.
phred <- function(x) {
  -10 * log10(pmax(x, 10^(-9.9)))
}
# Probability of observing base b when the true base is B at quality Q:
# 1 - 10^(-Q/10) when they match, the error probability otherwise.
pOfB <- function(b, B, Q) {
  #print(paste(b, B, Q))
  pCorrect = 1 - 10^(-Q/10)
  if ( b == B ) pCorrect else 1 - pCorrect
}
# log10 likelihood of diploid genotype G (a length-2 allele vector) given
# observed bases bs with qualities qs: each base is drawn from either allele
# with probability 1/2, errors independent across bases.
# seq_along replaces 1:length(bs): the old form iterated over c(1, 0) for
# empty input and indexed out of bounds; seq_along gives an empty loop
# (log-likelihood 0 for no data).
pOfG <- function(bs, qs, G) {
  alleleOne = G[1]
  alleleTwo = G[2]
  log10p = 0
  for ( i in seq_along(bs) ) {
    perBase = pOfB(bs[i], alleleOne, qs[i]) / 2 + pOfB(bs[i], alleleTwo, qs[i]) / 2
    log10p = log10p + log10(perBase)
  }
  return(log10p)
}
# Genotype likelihoods for a pileup of nAs 'a' bases and nBs 't' bases, all
# at quality Q.  Returns log10 likelihoods of the three genotypes aa/at/tt
# (p1/p2/p3) plus Qsample, the phred-scaled confidence that the site is het.
pOfGs <- function(nAs, nBs, Q) {
  bases = c(rep("a", nAs), rep("t", nBs))
  quals = rep(Q, nAs + nBs)
  logLikHomRef = pOfG(bases, quals, c("a", "a"))
  logLikHet    = pOfG(bases, quals, c("a", "t"))
  logLikHomVar = pOfG(bases, quals, c("t", "t"))
  totalLik = sum(10^(c(logLikHomRef, logLikHet, logLikHomVar)))
  Qsample = phred(1 - 10^logLikHet / totalLik)
  return(list(p1=logLikHomRef, p2=logLikHet, p3=logLikHomVar, Qsample=Qsample))
}
# Expected phred-scaled het-call quality at Poisson-average coverage `depth`
# and per-base quality Q: averages pOfGs()$Qsample over depth d (truncated at
# 3x the mean) and binomial allele sampling of the d bases.
# NOTE(review): prints a progress data.frame every inner iteration -- noisy,
# but preserved here; confirm whether callers rely on the trace output.
QsampleExpected <- function(depth, Q) {
weightedAvg = 0
for ( d in 1:(depth*3) ) {
Qsample = 0
pOfD = dpois(d, depth)  # probability of observing exactly d reads
for ( nBs in 0:d ) {
pOfnB = dbinom(nBs, d, 0.5)  # probability nBs of the d bases come from the alt allele
nAs = d - nBs
Qsample = pOfGs(nAs, nBs, Q)$Qsample
#Qsample = 1
weightedAvg = weightedAvg + Qsample * pOfD * pOfnB
print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg)))
}
}
return(weightedAvg)
}
# Draws QsampleExpected as a function of coverage: one rainbow-colored curve
# per base quality in Qs, on a fixed [0, Qmax] y-axis.
plotQsamples <- function(depths, Qs, Qmax) {
cols = rainbow(length(Qs))
# Empty frame first (type="n"); curves are added one at a time below.
plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling")
for ( i in 1:length(Qs) ) {
Q = Qs[i]
y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q)))
points(depths, y, col=cols[i], type="b")
}
legend("topleft", paste("Q", Qs), fill=cols)
}
# For each possible coverage d in 0..2*depth, the joint probability that
# coverage is exactly d (Poisson with mean `depth`) AND more than
# nallelesToCall alternate alleles are sampled (binomial p = 0.5).
pCallHetGivenDepth <- function(depth, nallelesToCall) {
  coverages = 0:(2*depth)
  pTooFewAltAlleles = sapply(coverages, function(d) sum(dbinom(0:nallelesToCall, d, 0.5)))
  dpois(coverages, depth) * (1 - pTooFewAltAlleles)
}
# Total probability of calling a het site at Poisson-average coverage `depth`,
# marginalized over the per-coverage probabilities.
pCallHets <- function(depth, nallelesToCall) {
  perCoverage = pCallHetGivenDepth(depth, nallelesToCall)
  sum(perCoverage)
}
# Probability the het is called in at least one of nsamples independent
# samples: complement of missing it in every sample.
pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) {
  pMissOneSample = 1 - pCallHets(depth, nallelesToCall)
  1 - pMissOneSample^nsamples
}

View File

@ -798,11 +798,11 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
private void generateAlternateConsensesFromKnownIndels(final Set<Consensus> altConsensesToPopulate, final int leftmostIndex, final byte[] reference) {
for ( VariantContext knownIndel : knownIndelsToTry ) {
if ( knownIndel == null || !knownIndel.isIndel() )
if ( knownIndel == null || !knownIndel.isIndel() || knownIndel.isComplexIndel() )
continue;
byte[] indelStr = knownIndel.isInsertion() ? knownIndel.getAlternateAllele(0).getBases() : Utils.dupBytes((byte)'-', knownIndel.getReference().length());
int start = knownIndel.getStart() - leftmostIndex + 1;
Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel.isDeletion());
Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel);
if ( c != null )
altConsensesToPopulate.add(c);
}
@ -988,7 +988,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
}
// create a Consensus from just the indel string that falls on the reference
private Consensus createAlternateConsensus(final int indexOnRef, final byte[] reference, final byte[] indelStr, final boolean isDeletion) {
private Consensus createAlternateConsensus(final int indexOnRef, final byte[] reference, final byte[] indelStr, final VariantContext indel) {
if ( indexOnRef < 0 || indexOnRef >= reference.length )
return null;
@ -1002,14 +1002,16 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
if ( indexOnRef > 0 )
cigar.add(new CigarElement(indexOnRef, CigarOperator.M));
if ( isDeletion ) {
if ( indel.isDeletion() ) {
refIdx += indelStr.length;
cigar.add(new CigarElement(indelStr.length, CigarOperator.D));
}
else {
else if ( indel.isInsertion() ) {
for ( byte b : indelStr )
sb.append((char)b);
cigar.add(new CigarElement(indelStr.length, CigarOperator.I));
} else {
throw new IllegalStateException("Creating an alternate consensus from a complex indel is not allows");
}
if ( reference.length - refIdx > 0 )

View File

@ -0,0 +1,143 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.commandline.Argument;
import java.io.File;
import java.util.*;
@By(DataSource.READS)
// walker to count realigned reads
// Walks reads against a sorted interval list (output of RealignerTargetCreator)
// and tallies how many intervals/reads were actually changed by realignment,
// detected via the presence of the original-cigar ("OC") tag.
public class RealignedReadCounter extends ReadWalker<Integer, Integer> {
// SAM tags written by the realigner: original cigar and original position.
public static final String ORIGINAL_CIGAR_TAG = "OC";
public static final String ORIGINAL_POSITION_TAG = "OP";
@Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true)
protected String intervalsFile = null;
// the intervals input by the user
private Iterator<GenomeLoc> intervals = null;
// the current interval in the list
private GenomeLoc currentInterval = null;
// Running totals reported in onTraversalDone.
private long updatedIntervals = 0, updatedReads = 0, affectedBases = 0;
// True once the current interval has contributed to updatedIntervals,
// so each interval is counted at most once.
private boolean intervalWasUpdated = false;
public void initialize() {
// prepare to read intervals one-by-one, as needed (assuming they are sorted).
intervals = new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new File(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY );
currentInterval = intervals.hasNext() ? intervals.next() : null;
}
// Per-read visitor: counts reads whose cigar differs from their stored
// original ("OC") cigar within the current interval; advances the interval
// pointer once reads pass it.  Always returns 0 (counting is via fields).
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
if ( currentInterval == null ) {
return 0;
}
GenomeLoc readLoc = ref.getGenomeLocParser().createGenomeLoc(read);
// hack to get around unmapped reads having screwy locations
if ( readLoc.getStop() == 0 )
readLoc = ref.getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart());
// 454 reads are excluded from the tally entirely.
if ( readLoc.isBefore(currentInterval) || ReadUtils.is454Read(read) )
return 0;
if ( readLoc.overlapsP(currentInterval) ) {
if ( doNotTryToClean(read) )
return 0;
// An OC tag means the realigner rewrote this read's cigar.
if ( read.getAttribute(ORIGINAL_CIGAR_TAG) != null ) {
String newCigar = (String)read.getAttribute(ORIGINAL_CIGAR_TAG);
// deal with an old bug
// (OC tag present but cigar unchanged -- not a real update)
if ( read.getCigar().toString().equals(newCigar) ) {
//System.out.println(currentInterval + ": " + read.getReadName() + " " + read.getCigarString() + " " + newCigar);
return 0;
}
if ( !intervalWasUpdated ) {
intervalWasUpdated = true;
updatedIntervals++;
// NOTE(review): 20 appears to be a fixed per-interval padding added
// to the indel length -- confirm where this constant comes from.
affectedBases += 20 + getIndelSize(read);
}
updatedReads++;
}
} else {
// Read is past the current interval: skip forward to the first interval
// not before this read (or exhaust the list).
do {
intervalWasUpdated = false;
currentInterval = intervals.hasNext() ? intervals.next() : null;
} while ( currentInterval != null && currentInterval.isBefore(readLoc) );
}
return 0;
}
// Size of the first indel in the read's cigar.
// NOTE(review): insertions return 0 while deletions return their length --
// confirm this asymmetry is intended for the affectedBases tally.
private int getIndelSize(SAMRecord read) {
for ( CigarElement ce : read.getCigar().getCigarElements() ) {
if ( ce.getOperator() == CigarOperator.I )
return 0;
if ( ce.getOperator() == CigarOperator.D )
return ce.getLength();
}
logger.warn("We didn't see an indel for this read: " + read.getReadName() + " " + read.getAlignmentStart() + " " + read.getCigar());
return 0;
}
// Mirrors the realigner's own read filters: reads it would never have
// touched are excluded from the counts.
private boolean doNotTryToClean(SAMRecord read) {
return read.getReadUnmappedFlag() ||
read.getNotPrimaryAlignmentFlag() ||
read.getReadFailsVendorQualityCheckFlag() ||
read.getMappingQuality() == 0 ||
read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ||
(BadMateFilter.hasBadMate(read));
}
public Integer reduceInit() {
return 0;
}
public Integer reduce(Integer value, Integer sum) {
return sum + value;
}
// Final report; the reduce result is ignored (all counting is in fields).
public void onTraversalDone(Integer result) {
System.out.println(updatedIntervals + " intervals were updated");
System.out.println(updatedReads + " reads were updated");
System.out.println(affectedBases + " bases were affected");
}
}

View File

@ -161,7 +161,7 @@ public class AnnotateMNPsWalker extends RodWalker<Integer, Integer> {
boolean atStartOfVc = curLocus.getStart() == vcLoc.getStart();
boolean atEndOfVc = curLocus.getStart() == vcLoc.getStop();
if (vc.getType() == VariantContext.Type.MNP) {
if (vc.isMNP()) {
logger.debug("Observed MNP at " + vcLoc);
if (isChrM(vc)) {

View File

@ -0,0 +1,62 @@
package org.broadinstitute.sting.gatk.walkers.qc;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.List;
import java.io.PrintStream;
/**
* Counts the number of contiguous regions the walker traverses over. Slower than it needs to be, but
* very useful since overlapping intervals get merged, so you can count the number of intervals the GATK merges down to.
* This was its very first use.
*/
public class CountIntervals extends RefWalker<Long, Long> {
    @Output
    PrintStream out;

    @Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false)
    int numOverlaps = 2;

    /** Per-interval running maximum starts at zero overlapping features. */
    public Long reduceInit() {
        return 0L; // uppercase L: lowercase 'l' is easily misread as the digit 1
    }

    /** Reduce per merged interval so onTraversalDone gets one count per interval. */
    public boolean isReduceByInterval() { return true; }

    /**
     * @return the number of "check" features overlapping this locus, or null
     *         (no data at this locus) which reduce treats as "no change".
     */
    public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( tracker == null ) {
            return null;
        }
        List<GATKFeature> checkIntervals = tracker.getGATKFeatureMetaData("check", false);
        return (long) checkIntervals.size();
    }

    /**
     * Tracks the maximum simultaneous overlap within the current interval.
     * Fix: a locus with no data (null) previously reset the accumulator to 0,
     * discarding the maximum seen so far in this interval; now it preserves prev.
     */
    public Long reduce(Long loc, Long prev) {
        return loc == null ? prev : Math.max(prev, loc);
    }

    /** Reports how many merged intervals reached at least numOverlaps overlaps. */
    public void onTraversalDone(List<Pair<GenomeLoc,Long>> finalReduce) {
        long count = 0;
        for ( Pair<GenomeLoc,Long> g : finalReduce ) {
            if ( g.second >= numOverlaps ) {
                count++;
            }
        }
        // %n terminates the report line (the original omitted the newline).
        out.printf("Number of contiguous intervals: %d%n", count);
    }
}

View File

@ -1,157 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.qc;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
import org.broad.tribble.readers.AsciiLineReader;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.SimpleTimer;
import java.io.PrintStream;
import java.io.File;
import java.io.FileInputStream;
import java.util.*;
/**
 * Profiles the raw I/O and decoding performance of the ROD (reference-ordered data) system
 * on a single input file, timing several read modes from raw byte reads up to full VCF
 * decoding, then timing the GATK traversal itself.
 */
@Requires(value={})
public class ProfileRodSystem extends RodWalker<Integer, Integer> {
    @Output(doc="File to which results should be written",required=true)
    protected PrintStream out;

    @Argument(fullName="nIterations", shortName="N", doc="Number of raw reading iterations to perform", required=false)
    int nIterations = 1;

    @Argument(fullName="maxRecords", shortName="M", doc="Max. number of records to process", required=false)
    int MAX_RECORDS = -1;

    // Single timer reused for every measurement; restarted before each timed section.
    SimpleTimer timer = new SimpleTimer("myTimer");

    /**
     * Runs the raw-file benchmarks (bytes, lines, split parts, loc-only decode, full decode)
     * nIterations times before the traversal starts, then restarts the timer so that
     * onTraversalDone() reports the walltime of the GATK traversal itself.
     */
    public void initialize() {
        File rodFile = getRodFile();

        out.printf("# walltime is in seconds%n");
        out.printf("# file is %s%n", rodFile);
        out.printf("# file size is %d bytes%n", rodFile.length());
        out.printf("operation\titeration\twalltime%n");

        for ( int i = 0; i < nIterations; i++ ) {
            out.printf("read.bytes\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_BYTE));
            out.printf("read.line\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_LINE));
            out.printf("line.and.parts\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_PARTS));
            out.printf("decode.loc\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE_LOC));
            out.printf("full.decode\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE));
        }

        timer.start(); // start up timer for map itself
    }

    // How much work each benchmark pass performs per record.
    private enum ReadMode { BY_BYTE, BY_LINE, BY_PARTS, DECODE_LOC, DECODE };

    /**
     * Times one pass over the file in the given mode.
     *
     * @param f    the file to read
     * @param mode how much decoding work to perform per record
     * @return elapsed walltime in seconds for the pass
     */
    private final double readFile(File f, ReadMode mode) {
        timer.start();

        FileInputStream s = null;
        try {
            byte[] data = new byte[100000];
            s = new FileInputStream(f);
            if ( mode == ReadMode.BY_BYTE ) {
                while (true) {
                    if ( s.read(data) == -1 )
                        break;
                }
            } else {
                int counter = 0;
                VCFCodec codec = new VCFCodec();
                String[] parts = new String[100000];
                AsciiLineReader lineReader = new AsciiLineReader(s);
                // the VCF header must be consumed before decode/decodeLoc calls will work
                if ( mode == ReadMode.DECODE_LOC || mode == ReadMode.DECODE )
                    codec.readHeader(lineReader);

                while (counter++ < MAX_RECORDS || MAX_RECORDS == -1) {
                    String line = lineReader.readLine();
                    if ( line == null )
                        break;
                    else if ( mode == ReadMode.BY_PARTS ) {
                        ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR);
                    }
                    else if ( mode == ReadMode.DECODE_LOC ) {
                        codec.decodeLoc(line);
                    }
                    else if ( mode == ReadMode.DECODE ) {
                        processOneVC((VariantContext)codec.decode(line));
                    }
                }
            }
        } catch ( Exception e ) {
            throw new RuntimeException(e);
        } finally {
            // BUGFIX: the stream was previously never closed, leaking a file descriptor
            // per benchmark pass; close it best-effort here.
            if ( s != null ) {
                try { s.close(); } catch ( Exception e ) { /* ignore close failure */ }
            }
        }

        return timer.getElapsedTime();
    }

    /**
     * @return the first ROD file bound on the command line; assumes at least one ROD was provided
     */
    private File getRodFile() {
        List<ReferenceOrderedDataSource> rods = this.getToolkit().getRodDataSources();
        ReferenceOrderedDataSource rod = rods.get(0);
        return rod.getFile();
    }

    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( tracker == null ) // RodWalkers can make funky map calls
            return 0;

        VariantContext vc = tracker.getVariantContext(ref, "rod", context.getLocation());
        processOneVC(vc);

        return 0;
    }

    // Touch the genotypes so that lazy sample parsing is actually exercised.
    private static final void processOneVC(VariantContext vc) {
        vc.getNSamples(); // force us to parse the samples
    }

    public Integer reduceInit() {
        return 0;
    }

    public Integer reduce(Integer counter, Integer sum) {
        return counter + sum;
    }

    public void onTraversalDone(Integer sum) {
        out.printf("gatk.traversal\t%d\t%.2f%n", 0, timer.getElapsedTime());
    }
}

View File

@ -0,0 +1,153 @@
package org.broadinstitute.sting.gatk.walkers.qc;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.*;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Collection;
import java.util.List;
/**
 * A walker for validating (in the style of validating pile-up) the ROD system.
 * Emits a header with the name, type, and MD5 of every bound ROD file, then one line
 * per site that carries at least one ROD record.
 */
@Reference(window=@Window(start=-40,stop=40))
public class RodSystemValidationWalker extends RodWalker<Integer,Integer> {
    // the divider to use in some of the text output
    private static final String DIVIDER = ",";

    @Output
    public PrintStream out;

    @Argument(fullName="PerLocusEqual",required=false,doc="Should we check that all records at the same site produce equivilent variant contexts")
    public boolean allRecordsVariantContextEquivalent = false;

    // used to calculate the MD5 of a file
    MessageDigest digest = null;

    // we sometimes need to know what rods the engine's seen
    List<ReferenceOrderedDataSource> rodList;

    /**
     * emit the md5 sums for each of the input ROD files (will save up a lot of time if and when the ROD files change
     * underneath us).
     */
    public void initialize() {
        // setup the MD5-er
        try {
            digest = MessageDigest.getInstance("MD5");
        } catch (NoSuchAlgorithmException e) {
            throw new ReviewedStingException("Unable to find MD5 checksumer");
        }
        out.println("Header:");
        // enumerate the list of ROD's we've loaded
        rodList = this.getToolkit().getRodDataSources();
        for (ReferenceOrderedDataSource rod : rodList) {
            out.println(rod.getName() + DIVIDER + rod.getType());
            out.println(rod.getName() + DIVIDER + rod.getFile());
            out.println(rod.getName() + DIVIDER + md5sum(rod.getFile()));
        }
        out.println("Data:");
    }

    /**
     *
     * @param tracker the ref meta data tracker to get RODs
     * @param ref reference context
     * @param context the reads
     * @return an 1 for each site with a rod(s), 0 otherwise
     */
    @Override
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        int ret = 0;
        if (tracker != null && tracker.getAllRods().size() > 0) {
            out.print(context.getLocation() + DIVIDER);
            Collection<GATKFeature> features = tracker.getAllRods();
            for (GATKFeature feat : features)
                out.print(feat.getName() + DIVIDER);
            out.println(";");
            ret++;
        }
        // if the argument was set, check for equivalence
        if (allRecordsVariantContextEquivalent && tracker != null) {
            Collection<VariantContext> col = tracker.getAllVariantContexts(ref);
            VariantContext con = null;
            for (VariantContext contextInList : col) {
                if (con == null) con = contextInList;
                // BUGFIX: this previously compared con against the whole Collection (col),
                // which can never equal a VariantContext, so every record after the first
                // was reported as a failure. Compare against the individual record instead.
                else if (!con.equals(contextInList)) out.println("FAIL: context " + contextInList + " doesn't match " + con);
            }
        }
        return ret;
    }

    /**
     * Provide an initial value for reduce computations.
     *
     * @return Initial value of reduce.
     */
    @Override
    public Integer reduceInit() {
        return 0;
    }

    /**
     * Reduces a single map with the accumulator provided as the ReduceType.
     *
     * @param value result of the map.
     * @param sum accumulator for the reduce.
     * @return accumulator with result of the map taken into account.
     */
    @Override
    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    @Override
    public void onTraversalDone(Integer result) {
        // Double check traversal result to make count is the same.
        // TODO: Is this check necessary?
        out.println("[REDUCE RESULT] Traversal result is: " + result);
    }

    // shamelessly absconded and adapted from http://www.javalobby.org/java/forums/t84420.html
    /**
     * Computes the hex MD5 of a file's contents using the shared digest.
     * MessageDigest.digest() resets the digest, so reuse across files is safe.
     *
     * @param f file to checksum
     * @return hex MD5 string, or "Not a file" when f cannot be opened
     */
    private String md5sum(File f) {
        InputStream is;
        try {
            is = new FileInputStream(f);
        } catch (FileNotFoundException e) {
            return "Not a file";
        }
        byte[] buffer = new byte[8192];
        int read = 0;
        try {
            while ((read = is.read(buffer)) > 0) {
                digest.update(buffer, 0, read);
            }
            byte[] md5sum = digest.digest();
            BigInteger bigInt = new BigInteger(1, md5sum);
            return bigInt.toString(16);
        }
        catch (IOException e) {
            throw new RuntimeException("Unable to process file for MD5", e);
        }
        finally {
            try {
                is.close();
            }
            catch (IOException e) {
                throw new RuntimeException("Unable to close input stream for MD5 calculation", e);
            }
        }
    }
}

View File

@ -1,298 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.qc;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.Argument;
import java.io.PrintStream;
/**
 * Validation walker for the BAQ (Base Alignment Quality) implementation: for each eligible
 * read it recomputes BAQ with the HMM and compares the result, base by base, against the BAQ
 * encoded in the read's BQ tag, reporting (and, in -strict mode, failing on) disagreements.
 * With -profile set it instead times the BAQ code paths over each read.
 */
@BAQMode(QualityMode = BAQ.QualityMode.DONT_MODIFY, ApplicationTime = BAQ.ApplicationTime.HANDLED_IN_WALKER)
@Reference(window=@Window(start=-5,stop=5))
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
    @Output(doc="File to which results should be written",required=true)
    protected PrintStream out;

    @Argument(doc="maximum read length to apply the BAQ calculation too",required=false)
    protected int maxReadLen = 1000;

    // band width passed to the samtools-style HMM; only consulted when samtoolsMode is set
    @Argument(doc="",required=false)
    protected int bw = 7;

    @Argument(doc="",required=false)
    protected boolean samtoolsMode = false;

    @Argument(doc="only operates on reads with this name",required=false)
    protected String readName = null;

    @Argument(doc="If true, all differences are errors", required=false)
    protected boolean strict = false;

    @Argument(doc="prints info for each read", required=false)
    protected boolean printEachRead = false;

    @Argument(doc="Also prints out detailed comparison information when for known calculation differences", required=false)
    protected boolean alsoPrintWarnings = false;

    @Argument(doc="Include reads without BAQ tag", required=false)
    protected boolean includeReadsWithoutBAQTag = false;

    // how many times each read is re-processed in profiling mode (magnifies the timings)
    @Argument(doc="x each read is processed", required=false)
    protected int magnification = 1;

    @Argument(doc="Profile performance", required=false)
    protected boolean profile = false;

    // number of reads examined so far; drives the periodic "Checking read" progress line
    int counter = 0;

    BAQ baqHMM = null; // matches current samtools parameters

    public void initialize() {
        // samtoolsMode mirrors samtools' exact HMM parameterization; otherwise GATK defaults
        if ( samtoolsMode )
            baqHMM = new BAQ(1e-3, 0.1, bw, (byte)0, true);
        else
            baqHMM = new BAQ();
    }

    // tallies of BAQ-tagged reads whose tag/HMM BAQs agreed vs. differed (see validateBAQ)
    long goodReads = 0, badReads = 0;

    public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
        // only consider reads passing the user's name/length/tag filters
        if ( (readName == null || readName.equals(read.getReadName())) && read.getReadLength() <= maxReadLen && (includeReadsWithoutBAQTag || BAQ.hasBAQTag(read) ) ) {
            if ( baqHMM.excludeReadFromBAQ(read) )
                return 0;
            if ( profile ) {
                profileBAQ(ref, read);
            } else {
                validateBAQ(ref, read);
            }
            return 1;
        }
        return 0;
    }

    SimpleTimer tagTimer = new SimpleTimer("from.tag");
    SimpleTimer baqReadTimer = new SimpleTimer("baq.read");
    SimpleTimer glocalTimer = new SimpleTimer("hmm.glocal");

    // Times the BAQ code paths on a single read, repeating each `magnification` times.
    // NOTE(review): the baq.read and hmm.glocal sections invoke the identical baqRead call,
    // so the two timers measure the same work — confirm whether glocal was meant to call
    // the HMM entry point directly.
    private void profileBAQ(ReferenceContext ref, SAMRecord read) {
        IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference();

        BAQ.BAQCalculationResult baq = null;

        tagTimer.restart();
        for ( int i = 0; i < magnification; i++ ) { BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag); }
        tagTimer.stop();

        baqReadTimer.restart();
        for ( int i = 0; i < magnification; i++ ) { baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY ); }
        baqReadTimer.stop();

        glocalTimer.restart();
        for ( int i = 0; i < magnification; i++ )
            baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY);
        glocalTimer.stop();
    }

    // Recomputes BAQ via the HMM and diffs it against the BQ-tag BAQ, byte by byte.
    // Known causes of disagreement (cigar/read length mismatch, Q0 bases, soft clips,
    // deletions, sub-minimum quals) are reported but only fail the run when -strict is set
    // (cigar-length mismatch never fails); an unexplained difference always fails.
    private void validateBAQ(ReferenceContext ref, SAMRecord read) {
        IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference();

        byte[] baqFromTag = BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag);
        if (counter++ % 1000 == 0 || printEachRead) out.printf("Checking read %s (%d)%n", read.getReadName(), counter);
        BAQ.BAQCalculationResult baq = baqHMM.calcBAQFromHMM(read, refReader);

        boolean fail = false;   // true => throw at the end of the method
        boolean print = false;  // true => a difference worth reporting was seen
        int badi = 0;           // index of the first differing base
        if ( BAQ.hasBAQTag(read) ) {
            for ( badi = 0; badi < baqFromTag.length; badi++ ) {
                if ( baqFromTag[badi] != baq.bq[badi] ) {
                    if ( cigarLength(read) != read.getReadLength() ) {
                        print = true;
                        fail = false;
                        out.printf(" different, but cigar length != read length%n");
                        break;
                    }
                    if (MathUtils.arrayMin(read.getBaseQualities()) == 0) {
                        print = true;
                        fail = strict;
                        out.printf(" different, but Q0 base detected%n");
                        break;
                    }
                    else if (readHasSoftClip(read) && ! samtoolsMode) {
                        print = true;
                        fail = strict;
                        out.printf(" different, but soft clip detected%n");
                        break;
                    } else if (readHasDeletion(read) ) { // && ! samtoolsMode) {
                        print = true;
                        fail = strict;
                        out.printf(" different, but deletion detected%n");
                        break;
                    } else if ( baq.bq[badi] < baqHMM.getMinBaseQual() ) {
                        print = fail = true;
                        out.printf(" Base quality %d < min %d", baq.bq[badi], baqHMM.getMinBaseQual());
                        break;
                    } else {
                        // no known explanation for the difference: always an error
                        print = fail = true;
                        break;
                    }
                }
            }

            // note: only reads carrying a BAQ tag are counted in the good/bad tallies
            if ( fail || print )
                badReads++;
            else
                goodReads++;
        }

        if ( fail || printEachRead || ( print && alsoPrintWarnings ) ) {
            // dump a full side-by-side picture of the read and both BAQ computations
            byte[] pos = new byte[baq.bq.length];
            for ( int i = 0; i < pos.length; i++ ) pos[i] = (byte)i;
            out.printf(" read length : %d%n", read.getReadLength());
            out.printf(" read start : %d (%d unclipped)%n", read.getAlignmentStart(), read.getUnclippedStart());
            out.printf(" cigar : %s%n", read.getCigarString());
            out.printf(" ref bases : %s%n", new String(baq.refBases));
            out.printf(" read bases : %s%n", new String(read.getReadBases()));
            out.printf(" ref length : %d%n", baq.refBases.length);
            out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG));
            if ( BAQ.hasBAQTag(read) ) printQuals(" BQ deltas : ", getBAQDeltas(read), true);
            printQuals(" original quals: ", read.getBaseQualities(), true);
            printQuals(" baq quals: ", baq.bq, true);
            printQuals(" positions : ", pos, true);
            printQuals(" original quals: ", read.getBaseQualities());
            if ( BAQ.hasBAQTag(read) ) printQuals(" tag quals: ", baqFromTag);
            printQuals(" hmm quals: ", baq.bq);
            out.printf(" read bases : %s%n", new String(read.getReadBases()));
            out.println(Utils.dupString('-', 80));
        }

        if ( fail )
            throw new StingException(String.format("BAQ from read and from HMM differ in read %s at position %d: tag qual = %d, hmm qual = %d",
                    read.getReadName(), badi, baqFromTag[badi], baq.bq[badi]));
    }

    // true if any cigar element of the read is a soft clip
    private final static boolean readHasSoftClip(SAMRecord read) {
        for (CigarElement e : read.getCigar().getCigarElements()) {
            if ( e.getOperator() == CigarOperator.SOFT_CLIP )
                return true;
        }
        return false;
    }

    // true if any cigar element of the read is a deletion
    private final static boolean readHasDeletion(SAMRecord read) {
        for (CigarElement e : read.getCigar().getCigarElements()) {
            if ( e.getOperator() == CigarOperator.DELETION )
                return true;
        }
        return false;
    }

    // convenience: character-mode print to this walker's output stream
    public final void printQuals( String prefix, byte[] quals ) {
        printQuals(prefix, quals, false);
    }

    // convenience: print to this walker's output stream
    public final void printQuals( String prefix, byte[] quals, boolean asInt ) {
        printQuals(out, prefix, quals, asInt);
    }

    /**
     * Prints prefix followed by the quals on one line: comma-separated integers when asInt
     * is set, otherwise FASTQ-style (qual+33) characters.
     */
    public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) {
        out.print(prefix);
        for ( int i = 0; i < quals.length; i++) {
            if ( asInt ) {
                out.printf("%2d", (int)quals[i]);
                if ( i+1 != quals.length ) out.print(",");
            } else
                out.print((char)(quals[i]+33));
        }
        out.println();
    }

    /**
     * Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present.
     *
     * @param read read whose BAQ tag should be decoded
     * @return per-base deltas (sign flipped, 64 offset removed), or null when the tag is absent
     */
    public static byte[] getBAQDeltas(SAMRecord read) {
        byte[] baq = BAQ.getBAQTag(read);
        if ( baq != null ) {
            byte[] deltas = new byte[baq.length];
            for ( int i = 0; i < deltas.length; i++)
                deltas[i] = (byte)(-1 * (baq[i] - 64));
            return deltas;
        } else
            return null;
    }

    // Sums the read-consuming cigar lengths (S/I/M; D/H/P consume none); returns 0 for
    // reads containing N elements, which this walker cannot handle.
    private int cigarLength(SAMRecord read) {
        int readI = 0;
        for ( CigarElement elt : read.getCigar().getCigarElements() ) {
            int l = elt.getLength();
            switch (elt.getOperator()) {
                case N: // cannot handle these
                    return 0;
                case H : case P : // ignore pads and hard clips
                    break;
                case S :
                case I :
                    readI += l;
                    break;
                case D : break;
                case M :
                    readI += l;
                    break;
                default:
                    throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
            }
        }
        return readI;
    }

    public Integer reduceInit() { return 0; }

    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    // In validation mode reports concordance percentages; in profile mode, per-path timings.
    // NOTE(review): percentages divide by nreads — zero processed reads would divide by zero.
    public void onTraversalDone(Integer nreads) {
        if ( profile ) {
            out.printf("n.reads baq.per.read calculation time.in.secs%n");
            printTimer(nreads, tagTimer);
            printTimer(nreads, glocalTimer);
            printTimer(nreads, baqReadTimer);
        } else {
            out.printf("total reads BAQ'd %d; concordant BAQ reads %d %.4f; discordant BAQ reads %d %.4f%n", nreads,
                    goodReads, (100.0 * goodReads) / nreads,
                    badReads, (100.0 * badReads) / nreads);
        }
    }

    private final void printTimer(int nreads, SimpleTimer timer) {
        out.printf("%d %d %s %.2f%n", nreads, magnification, timer.getName(), timer.getElapsedTime());
    }
}

View File

@ -209,7 +209,7 @@ public class PickSequenomProbes extends RodWalker<String, String> {
String assay_sequence;
if ( vc.isSNP() )
assay_sequence = leading_bases + "[" + (char)ref.getBase() + "/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases;
else if ( vc.getType() == VariantContext.Type.MNP )
else if ( vc.isMNP() )
assay_sequence = leading_bases + "[" + new String(vc.getReference().getBases()) + "/" + new String(vc.getAlternateAllele(0).getBases())+"]"+trailing_bases.substring(vc.getReference().length()-1);
else if ( vc.isInsertion() )
assay_sequence = leading_bases + (char)ref.getBase() + "[-/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases;

View File

@ -113,7 +113,7 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
observedRefAllele = Allele.create(Allele.NULL_ALLELE_STRING);
}
// deletions
else if ( vc.isDeletion() || vc.isMixed() || vc.getType() == VariantContext.Type.MNP ) {
else if ( vc.isDeletion() || vc.isMixed() || vc.isMNP() ) {
// we can't validate arbitrarily long deletions
if ( reportedRefAllele.length() > 100 ) {
logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", reportedRefAllele.length(), vc.getChr(), vc.getStart()));
@ -123,7 +123,7 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
// deletions are associated with the (position of) the last (preceding) non-deleted base;
// hence to get actually deleted bases we need offset = 1
int offset = 1 ;
if ( vc.getType() == VariantContext.Type.MNP ) offset = 0; // if it's an MNP, the reported position IS the first modified base
if ( vc.isMNP() ) offset = 0; // if it's an MNP, the reported position IS the first modified base
byte[] refBytes = ref.getBases();
byte[] trueRef = new byte[reportedRefAllele.length()];
for (int i = 0; i < reportedRefAllele.length(); i++)

View File

@ -414,7 +414,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
* @return vc subcontext
*/
public VariantContext subContextFromGenotypes(Collection<Genotype> genotypes, Set<Allele> alleles) {
return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), getFilters(), getAttributes());
return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), filtersWereApplied() ? getFilters() : null, getAttributes());
}
@ -566,10 +566,21 @@ public class VariantContext implements Feature { // to enable tribble intergrati
return getType() == Type.INDEL && getAlternateAllele(0).isNull();
}
/**
* @return true if the alleles indicate neither a simple deletion nor a simple insertion
*/
public boolean isComplexIndel() {
return isIndel() && !isDeletion() && !isInsertion();
}
public boolean isSymbolic() {
return getType() == Type.SYMBOLIC;
}
public boolean isMNP() {
return getType() == Type.MNP;
}
/**
* convenience method for indels
*

View File

@ -83,7 +83,7 @@ public abstract class BaseTest {
*/
public static final String MD5_FILE_DB_SUBDIR = "integrationtests";
public static final String testDir = "testdata/";
public static final String testDir = "public/testdata/";
/** before the class starts up */
static {

View File

@ -60,8 +60,8 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest {
GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine();
Collection<SAMReaderID> samFiles = new ArrayList<SAMReaderID>();
samFiles.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags()));
testEngine.setSAMFileIDs(samFiles);
testEngine.checkForDuplicateSamFiles();
@ -72,10 +72,10 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest {
GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine();
Collection<SAMReaderID> samFiles = new ArrayList<SAMReaderID>();
samFiles.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("testdata/exampleNORG.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("testdata/exampleNORG.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags()));
samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags()));
testEngine.setSAMFileIDs(samFiles);
testEngine.checkForDuplicateSamFiles();
@ -97,7 +97,7 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest {
GATKArgumentCollection argCollection = new GATKArgumentCollection();
testEngine.setArguments(argCollection);
File fastaFile = new File("testdata/exampleFASTA.fasta");
File fastaFile = new File("public/testdata/exampleFASTA.fasta");
GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile));
testEngine.setGenomeLocParser(genomeLocParser);

View File

@ -19,7 +19,7 @@ public class RealignerTargetCreatorIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
"-T RealignerTargetCreator -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s",
1,
Arrays.asList("fd2d9dbf718f7a6d82ae1787ac1fe61e"));
Arrays.asList("f23ba17ee0f9573dd307708175d90cd2"));
executeTest("test dbsnp", spec2);
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(

View File

@ -60,7 +60,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest {
" --NO_HEADER" +
" -o %s",
1,
Arrays.asList("2cae3d16f9ed00b07d87e9c49272d877")
Arrays.asList("debbbf3e661b6857cc8d99ff7635bb1d")
);
executeTest("testSimpleVCFStreaming", spec);
@ -98,7 +98,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest {
" -EV CompOverlap -noEV -noST" +
" -o %s",
1,
Arrays.asList("f60729c900bc8368717653b3fad80d1e")
Arrays.asList("f60729c900bc8368717653b3fad80d1e") //"f60729c900bc8368717653b3fad80d1e"
);
executeTest("testVCFStreamingChain", selectTestSpec);

View File

@ -12,19 +12,16 @@ import org.testng.annotations.DataProvider;
import org.testng.annotations.BeforeMethod;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.qc.ValidateBAQWalker;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.Utils;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.Arrays;
import java.io.PrintStream;
import java.util.List;
import java.util.ArrayList;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.*;
/**
@ -188,8 +185,8 @@ public class BAQUnitTest extends BaseTest {
System.out.println(Utils.dupString('-', 40));
System.out.println("reads : " + new String(test.readBases));
ValidateBAQWalker.printQuals(System.out, "in-quals:", test.quals, false);
ValidateBAQWalker.printQuals(System.out, "bq-quals:", result.bq, false);
printQuals(System.out, "in-quals:", test.quals, false);
printQuals(System.out, "bq-quals:", result.bq, false);
for (int i = 0; i < test.quals.length; i++) {
//result.bq[i] = baqHMM.capBaseByBAQ(result.rawQuals[i], result.bq[i], result.state[i], i);
Assert.assertTrue(result.bq[i] >= baqHMM.getMinBaseQual() || test.expected[i] < baqHMM.getMinBaseQual(), "BQ < min base quality");
@ -197,4 +194,16 @@ public class BAQUnitTest extends BaseTest {
}
}
/**
 * Writes a quality array to the given stream on a single line, preceded by prefix:
 * comma-separated 2-wide integers when asInt is set, otherwise FASTQ-style (qual+33) chars.
 */
public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) {
    out.print(prefix);
    int n = quals.length;
    for ( int idx = 0; idx < n; idx++ ) {
        if ( asInt ) {
            out.printf("%2d", (int) quals[idx]);
            boolean isLast = (idx + 1 == n);
            if ( ! isLast )
                out.print(",");
        } else {
            out.print((char) (quals[idx] + 33));
        }
    }
    out.println();
}
}

View File

@ -21,7 +21,7 @@ public class BedParserUnitTest extends BaseTest {
private static IndexedFastaSequenceFile seq;
private GenomeLocParser genomeLocParser;
private File bedFile = new File("testdata/sampleBedFile.bed");
private File bedFile = new File("public/testdata/sampleBedFile.bed");
@BeforeClass
public void beforeTests() {

View File

@ -39,7 +39,7 @@ import java.io.IOException;
*/
public class HapMapUnitTest {
// our sample hapmap file
private final static File hapMapFile = new File("testdata/genotypes_chr1_ASW_phase3.3_first500.hapmap");
private final static File hapMapFile = new File("public/testdata/genotypes_chr1_ASW_phase3.3_first500.hapmap");
private final static String knownLine = "rs2185539 C/T chr1 556738 + ncbi_b36 bbs urn:lsid:bbs.hapmap.org:Protocol:Phase3.r3:1 urn:lsid:bbs.hapmap.org:Assay:Phase3.r3_r" +
"s2185539:1 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:4 QC+ CC TC TT CT CC CC CC CC CC CC CC CC CC";
/**

View File

@ -17,8 +17,8 @@ import java.util.*;
*/
public class IndexFactoryUnitTest {
File inputFile = new File("testdata/HiSeq.10000.vcf");
File outputFile = new File("testdata/onTheFlyOutputTest.vcf");
File inputFile = new File("public/testdata/HiSeq.10000.vcf");
File outputFile = new File("public/testdata/onTheFlyOutputTest.vcf");
File outputFileIndex = Tribble.indexFile(outputFile);
/**

View File

@ -49,12 +49,12 @@ public class ListFileUtilsUnitTest extends BaseTest {
public void testIgnoreBlankLinesInBAMListFiles() throws Exception {
File tempListFile = createTempListFile("testIgnoreBlankLines",
"",
"testdata/exampleBAM.bam",
"public/testdata/exampleBAM.bam",
" "
);
List<SAMReaderID> expectedBAMFileListAfterUnpacking = new ArrayList<SAMReaderID>();
expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags()));
expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags()));
performBAMListFileUnpackingTest(tempListFile, expectedBAMFileListAfterUnpacking);
}
@ -63,13 +63,13 @@ public class ListFileUtilsUnitTest extends BaseTest {
public void testCommentSupportInBAMListFiles() throws Exception {
File tempListFile = createTempListFile("testCommentSupport",
"#",
"testdata/exampleBAM.bam",
"#testdata/foo.bam",
" # testdata/bar.bam"
"public/testdata/exampleBAM.bam",
"#public/testdata/foo.bam",
" # public/testdata/bar.bam"
);
List<SAMReaderID> expectedBAMFileListAfterUnpacking = new ArrayList<SAMReaderID>();
expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags()));
expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags()));
performBAMListFileUnpackingTest(tempListFile, expectedBAMFileListAfterUnpacking);
}

View File

@ -29,7 +29,7 @@ public class GenomeLocProcessingTrackerUnitTest extends BaseTest {
IndexedFastaSequenceFile fasta = null;
GenomeLocParser genomeLocParser = null;
String chr1 = null;
private final static String FILE_ROOT = "testdata/GLPTFile";
private final static String FILE_ROOT = "public/testdata/GLPTFile";
@BeforeTest
public void before() {

View File

@ -1,15 +1,15 @@
package core
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.function.ListWriterFunction
import net.sf.samtools.{SAMFileReader,SAMReadGroupRecord}
import scala.io.Source._
import collection.JavaConversions._
import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel
import org.broadinstitute.sting.queue.extensions.picard._
import net.sf.samtools.{SAMFileReader, SAMReadGroupRecord}
import net.sf.samtools.SAMFileHeader.SortOrder
class DataProcessingPipeline extends QScript {
@ -69,6 +69,8 @@ class DataProcessingPipeline extends QScript {
@Input(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false)
var useBWApe: Boolean = false
@Input(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false)
var bwaThreads: Int = 1
/****************************************************************************
@ -180,6 +182,7 @@ class DataProcessingPipeline extends QScript {
var realignedBams: List[File] = List()
var index = 1
for (bam <- bams) {
val readSortedBam = swapExt(bam, ".bam", "." + index + ".sorted.bam" )
val saiFile1 = swapExt(bam, ".bam", "." + index + ".1.sai")
val saiFile2 = swapExt(bam, ".bam", "." + index + ".2.sai")
val realignedSamFile = swapExt(bam, ".bam", "." + index + ".realigned.sam")
@ -190,11 +193,12 @@ class DataProcessingPipeline extends QScript {
bwa_sam_se(bam, saiFile1, realignedSamFile))
}
else {
add(bwa_aln_pe(bam, saiFile1, 1),
bwa_aln_pe(bam, saiFile2, 2),
bwa_sam_pe(bam, saiFile1, saiFile2, realignedSamFile))
add(sortSam(bam, readSortedBam, SortOrder.queryname),
bwa_aln_pe(readSortedBam, saiFile1, 1),
bwa_aln_pe(readSortedBam, saiFile2, 2),
bwa_sam_pe(readSortedBam, saiFile1, saiFile2, realignedSamFile))
}
add(sortSam(realignedSamFile, realignedBamFile))
add(sortSam(realignedSamFile, realignedBamFile, SortOrder.coordinate))
addReadGroups(realignedBamFile, rgRealignedBamFile, new SAMFileReader(bam))
realignedBams :+= rgRealignedBamFile
index = index + 1
@ -385,9 +389,10 @@ class DataProcessingPipeline extends QScript {
this.jobName = queueLogDir + outBam + ".joinBams"
}
case class sortSam (inSam: File, outBam: File) extends SortSam {
case class sortSam (inSam: File, outBam: File, sortOrderP: SortOrder) extends SortSam {
this.input = List(inSam)
this.output = outBam
this.sortOrder = sortOrderP
this.memoryLimit = 4
this.isIntermediate = true
this.analysisName = queueLogDir + outBam + ".sortSam"
@ -430,7 +435,7 @@ class DataProcessingPipeline extends QScript {
case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with BWACommonArgs {
@Input(doc="bam file to be aligned") var bam = inBam
@Output(doc="output sai file") var sai = outSai
def commandLine = bwaPath + " aln -q 5 " + reference + " -b " + bam + " > " + sai
def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b " + bam + " > " + sai
this.analysisName = queueLogDir + outSai + ".bwa_aln_se"
this.jobName = queueLogDir + outSai + ".bwa_aln_se"
}
@ -438,7 +443,7 @@ class DataProcessingPipeline extends QScript {
case class bwa_aln_pe (inBam: File, outSai1: File, index: Int) extends CommandLineFunction with BWACommonArgs {
@Input(doc="bam file to be aligned") var bam = inBam
@Output(doc="output sai file for 1st mating pair") var sai = outSai1
def commandLine = bwaPath + " aln -q 5 " + reference + " -b" + index + " " + bam + " > " + sai
def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b" + index + " " + bam + " > " + sai
this.analysisName = queueLogDir + outSai1 + ".bwa_aln_pe1"
this.jobName = queueLogDir + outSai1 + ".bwa_aln_pe1"
}

View File

@ -1,4 +1,4 @@
package core
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._

View File

@ -1,3 +1,5 @@
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.commandline.Hidden
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript

View File

@ -0,0 +1,91 @@
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._
import net.sf.samtools.SAMFileReader
/**
* Created by IntelliJ IDEA.
* User: carneiro
* Date: 4/20/11
* Time: 16:29 PM
*/
class RecalibrateBaseQualities extends QScript {
@Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true)
var GATKjar: File = _
@Input(doc="input BAM file - or list of BAM files", shortName="i", required=true)
var input: File = _
@Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=false)
var R: String = new File("/humgen/gsa-scr1/carneiro/stable/R")
@Input(doc="Reference fasta file", shortName="R", required=false)
var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
@Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false)
var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
val queueLogDir: String = ".qlog/"
var nContigs: Int = 0
def getNumberOfContigs(bamFile: File): Int = {
val samReader = new SAMFileReader(new File(bamFile))
return samReader.getFileHeader.getSequenceDictionary.getSequences.size()
}
def script = {
nContigs = getNumberOfContigs(input)
val recalFile1: File = new File("recal1.csv")
val recalFile2: File = new File("recal2.csv")
val recalBam: File = swapExt(input, ".bam", "recal.bam")
val path1: String = "before"
val path2: String = "after"
add(cov(input, recalFile1),
recal(input, recalFile1, recalBam),
cov(recalBam, recalFile2),
analyzeCovariates(recalFile1, path1),
analyzeCovariates(recalFile2, path2))
}
trait CommandLineGATKArgs extends CommandLineGATK {
this.jarFile = GATKjar
this.reference_sequence = reference
this.memoryLimit = 4
this.isIntermediate = true
}
case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs {
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
this.input_file :+= inBam
this.recal_file = outRecalFile
this.analysisName = queueLogDir + outRecalFile + ".covariates"
this.jobName = queueLogDir + outRecalFile + ".covariates"
this.scatterCount = nContigs
}
case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs {
this.input_file :+= inBam
this.recal_file = inRecalFile
this.out = outBam
this.isIntermediate = false
this.analysisName = queueLogDir + outBam + ".recalibration"
this.jobName = queueLogDir + outBam + ".recalibration"
this.scatterCount = nContigs
}
case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates {
this.resources = R
this.recal_file = inRecalFile
this.output_dir = outPath.toString
this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates"
this.jobName = queueLogDir + inRecalFile + ".analyze_covariates"
}
}

View File

@ -1,4 +1,4 @@
package core
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk.RodBind

View File

@ -1,211 +0,0 @@
package org.broadinstitute.sting.scala
/**
import gatk.walkers.genotyper.{UnifiedGenotyper, GenotypeCall}
import java.io.File
import net.sf.samtools.SAMRecord
import org.broadinstitute.sting.gatk.walkers.LocusWalker
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker
import org.broadinstitute.sting.gatk.contexts.ReferenceContext
import org.broadinstitute.sting.gatk.contexts.AlignmentContext
import org.broadinstitute.sting.utils.Pair
import org.broadinstitute.sting.utils.genotype.GenotypeMetaData
import utils._
import cmdLine.Argument
class TransitionTable() {
var table: Array[Array[Double]] = new Array[Array[Double]](4,4)
def makeKey(b: Char): Int = {
return BaseUtils.simpleBaseToBaseIndex(b)
}
def fromKey(i: Int): Char = {
return BaseUtils.baseIndexToSimpleBase(i)
}
def add(ref: Char, subst: Char) = {
table(makeKey(ref))(makeKey(subst)) += 1
}
def get(i: Int, j: Int): Double = {
return table(i)(j)
}
def set(i: Int, j: Int, d: Double) = {
//printf("Setting %d / %d => %f%n", i, j, d)
table(i)(j) = d
}
def mapEntries[A](f: ((Int, Int, Double) => A)): List[A] = {
return for { i <- List.range(0, 4); j <- List.range(0,4) } yield f(i, j, get(i,j))
}
def header(): String = {
//return Utils.join("\t", mapEntries((i,j,x) => "P(%c|true=%c)" format (fromKey(j), fromKey(i))).toArray)
return Utils.join("\t", mapEntries((i,j,x) => "[%c|%c]" format (fromKey(j), fromKey(i))).toArray)
}
override def toString(): String = {
return Utils.join("\t", mapEntries((i,j,x) => "%.2f" format x).toArray)
}
def sum(): Double = {
return sumList(table.toList map (x => sumList(x.toList)))
}
def sumList(xs: List[Double]): Double = {
return (0.0 :: xs) reduceLeft {(x, y) => x + y}
}
def getRateMatrix(): TransitionTable = {
var sums = (table.toList map (x => sumList(x.toList))).toArray
var t = new TransitionTable()
//println(sums.toList)
this mapEntries ((i,j,x) => t.set(i, j, x / sums(i)))
return t
}
}
**/class BaseTransitionTableCalculator /**{ // extends LocusWalker[Unit,Int] {
private var MIN_MAPPING_QUALITY = 30
private var MIN_BASE_QUALITY = 20
private var MIN_LOD = 5
private var PRINT_FREQ = 1000000
private var CARE_ABOUT_STRAND = true
private val SSG = new UnifiedGenotyper()
private val table1 = new TransitionTable()
private val tableFWD = new TransitionTable()
private val tableREV = new TransitionTable()
private val table2 = new TransitionTable()
@Argument() { val shortName = "v", val doc = "Print out verbose output", val required = false }
var VERBOSE: Boolean = false
override def initialize() {
SSG.initialize();
}
override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext): Unit = {
val callPair = SSG.map(tracker, ref, context)
//val call = callPair.getFirst().get(0)
val pileup = new ReadBackedPileup(ref.getBase(), context)
def hasNoNs(): Boolean = {
return ! (pileup.getBases() exists ('N' ==))
}
if ( isDefinitelyHomRef(callPair) && hasNoNs() ) {
var (refBases, nonRefBases) = splitBases(ref.getBase, pileup.getBases)
//printf("nonRefBases %s%n", nonRefBases)
if ( nonRefBases.length > 0 ) {
var (nonRefReads, offsets) = getNonRefReads(ref.getBase, pileup)
var nonRefRead: SAMRecord = nonRefReads.head
nonRefBases.length match {
case 1 =>
//if ( goodRead(nonRefRead,offsets.head) ) {
//printf("Including site:%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", nonRefBases, refBases, nonRefRead.format, offsets.head)
//}
addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table1)
addRead(nonRefRead, offsets.head, nonRefBases.head, ref, if ( nonRefRead.getReadNegativeStrandFlag ) tableREV else tableFWD )
case 2 =>
addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table2)
addRead(nonRefReads.tail.head, offsets.tail.head, nonRefBases.tail.head, ref, table2)
case x =>
}
if ( VERBOSE && pileup.size > 30 && nonRefReads.length < 4 )
printNonRefBases(ref, nonRefReads, offsets, context.getReads.size())
}
}
}
implicit def listToJavaList[T](l: List[T]) = java.util.Arrays.asList(l.toArray: _*)
def printNonRefBases(ref: ReferenceContext, nonRefReads: List[SAMRecord], offsets: List[Integer], depth: Integer): Unit = {
out.println(new ReadBackedPileup(ref.getLocus(), ref.getBase(), listToJavaList(nonRefReads), offsets) + " " + depth)
}
def addRead(nonRefRead: SAMRecord, offset: Integer, nonRefBase: Char, ref: ReferenceContext, table: TransitionTable): Unit = {
if ( goodRead(nonRefRead, offset) ) {
//printf("Including site:%n call=%s%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", nonRefBases, refBases, nonRefRead.format, offsets.head)
//println(call.getReadDepth, call.getReferencebase, nonRefBases, refBases, nonRefRead.getMappingQuality)
if ( CARE_ABOUT_STRAND && nonRefRead.getReadNegativeStrandFlag() ) {
// it's on the reverse strand
val refComp: Char = BaseUtils.simpleComplement(ref.getBase)
val nonRefComp: Char = BaseUtils.simpleComplement(nonRefBase)
//printf("Negative: %c/%c is actually %c/%c%n", ref.getBase, nonRefBases.head, refComp, nonRefComp)
table.add(refComp, nonRefComp)
} else {
table.add(ref.getBase, nonRefBase)
}
}
}
def getNonRefReads(ref: Char, pileup: ReadBackedPileup): (List[SAMRecord], List[Integer]) = {
def getBase(i: Int): Char = {
var offset = pileup.getOffsets().get(i)
return pileup.getReads().get(i).getReadString().charAt(offset.asInstanceOf[Int])
}
var subset = for { i <- List.range(0, pileup.size) if getBase(i) != ref } yield i
var reads = subset map (i => pileup.getReads().get(i))
var offsets = subset map (i => pileup.getOffsets().get(i))
//println(reads, offsets, subset)
return (reads, offsets)
}
def hasFewNonRefBases(refBase: Char, pileup: String): List[Char] = {
splitBases(refBase, pileup) match {
case (refs, nonRefs) =>
//printf("nrf=%s, rb=%s%n", nonRefs.mkString, refs.mkString)
return nonRefs
}
}
def goodRead( read: SAMRecord, offset: Integer ): Boolean = {
return read.getMappingQuality() > MIN_MAPPING_QUALITY && read.getBaseQualities()(offset.intValue) > MIN_BASE_QUALITY
}
def splitBases(refBase: Char, pileup: String): (List[Char], List[Char]) = {
val refBases = pileup.toList filter (refBase ==)
val nonRefBases = pileup.toList filter (refBase !=)
return (refBases, nonRefBases)
}
def isDefinitelyHomRef(call: Pair[java.util.List[GenotypeCall],GenotypeMetaData]): Boolean = {
if ( call == null )
return false;
return (! call.getFirst().get(0).isVariant()) && call.getFirst().get(0).getNegLog10PError() > MIN_LOD
}
def reduceInit(): Int = {
return 0;
}
def reduce(m: Unit, r: Int): Int = {
return r;
}
override def onTraversalDone(r: Int) = {
out.println("-------------------- FINAL RESULT --------------------")
out.println("type\t" + table1.header)
def print1(t: String, x: TransitionTable) = {
out.println(t + "\t" + x)
}
print1("1-mismatch", table1)
print1("2-mismatch", table2)
print1("1-mismatch-fwd", tableFWD)
print1("1-mismatch-rev", tableREV)
}
}
*/

View File

@ -1,120 +0,0 @@
package org.broadinstitute.sting.scala
import java.io.PrintStream
import org.broadinstitute.sting.gatk.contexts.{ReferenceContext, AlignmentContext}
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker
import collection.JavaConversions._
import collection.immutable.List
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature
import org.broadinstitute.sting.gatk.walkers.{TreeReducible, RefWalker}
import org.broadinstitute.sting.commandline.{Output, Argument}
import org.broadinstitute.sting.utils.{BaseUtils, GenomeLoc}
import collection.mutable.{ListBuffer, HashSet}
import java.lang.Math
class IntervalAnnotationWalker extends RefWalker[AnnotationMetaData,List[IntervalInfoBuilder]] {
@Argument(doc="Min proportion of bases overlapping between an interval of interest and an annotation interval for annotation to occur",shortName="mpb")
var minPropOverlap : Double = 0.50 // default to 50%
@Output var out : PrintStream = _
override def reduceInit : List[IntervalInfoBuilder] = {
Nil
}
override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext) : AnnotationMetaData = {
val ivalsOfInterest : List[GATKFeature] = tracker.getGATKFeatureMetaData("interval_list",false).toList
val refSeqData : List[Object] = tracker.getReferenceMetaData("refseq").toList
val tableMetaData : List[Object] = tracker.getReferenceMetaData("table",false).toList
var amd : AnnotationMetaData = new AnnotationMetaData(ref.getLocus,ref.getBase)
if ( refSeqData != null ) {
amd.refSeqFeatures = refSeqData.map( (o: Object) => o.asInstanceOf[RefSeqFeature])
}
if ( tableMetaData != null ) {
amd.tableFeatures = tableMetaData.map( (o: Object) => o.asInstanceOf[TableFeature]).map( (u: TableFeature) =>
new Tuple2(u.getLocation,u.getAllValues.toList.reduceLeft(_ + "," + _)))
}
amd.newIvals = ivalsOfInterest.map(_.getLocation)
return amd
}
override def reduce(mapMetaData : AnnotationMetaData, rList : List[IntervalInfoBuilder]) : List[IntervalInfoBuilder] = {
rList.filter( (u : IntervalInfoBuilder) => u.location.isBefore(mapMetaData.locus)).foreach(u => out.print("%s%n".format(u.done)))
val newList : List[IntervalInfoBuilder] = rList.filter( ! _.finalized ) :::
mapMetaData.newIvals.filter( (g: GenomeLoc) => g.getStart == mapMetaData.locus.getStart ).map( u => new IntervalInfoBuilder(u,minPropOverlap) )
newList.foreach(_.update(mapMetaData))
return newList
}
override def onTraversalDone(finalList : List[IntervalInfoBuilder]) = {
finalList.foreach(u => out.print("%s%n".format(u.done)))
}
}
class AnnotationMetaData(loc: GenomeLoc, ref: Byte) {
val locus : GenomeLoc = loc
var newIvals : List[GenomeLoc] = Nil
var refSeqFeatures : List[RefSeqFeature] = Nil
var tableFeatures : List[(GenomeLoc,String)] = Nil
val refBase : Byte = ref
}
class IntervalInfoBuilder(loc : GenomeLoc, minProp : Double) {
val OVERLAP_PROP : Double = minProp
var metaData : HashSet[String] = new HashSet[String]
var seenTranscripts : HashSet[String] = new HashSet[String]
var baseContent : ListBuffer[Byte] = new ListBuffer[Byte]
var location : GenomeLoc = loc
var gcContent : Double = _
var entropy : Double = _
var finalized : Boolean = false
def update(mmd : AnnotationMetaData) = {
baseContent += mmd.refBase
mmd.refSeqFeatures.filter( p => ! seenTranscripts.contains(p.getTranscriptUniqueGeneName)).view.foreach(u => addRefSeq(u))
mmd.tableFeatures.filter( (t : (GenomeLoc,String)) =>
! metaData.contains(t._2) && (t._1.intersect(location).size.asInstanceOf[Double]/location.size() >= OVERLAP_PROP) ).foreach( t => metaData.add(t._2))
}
def addRefSeq( rs: RefSeqFeature ) : Unit = {
seenTranscripts.add(rs.getTranscriptUniqueGeneName)
val exons : List[(GenomeLoc,Int)] = rs.getExons.toList.zipWithIndex.filter( p => ((p._1.intersect(location).size+0.0)/location.size) >= minProp )
if ( exons.size > 0 ) {
metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,exons.map( p => "exon%d".format(p._2) ).reduceLeft(_ + "," + _)))
} else {
if ( location.isBefore(rs.getExons.get(0)) || location.isPast(rs.getExons.get(rs.getNumExons-1)) ) {
metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,"UTR"))
} else {
metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,"Intron"))
}
}
}
def done : String = {
finalized = true
def isGC(b : Byte) : Int = if ( BaseUtils.gIndex == b || BaseUtils.cIndex == b ) { 1 } else { 0 }
gcContent = baseContent.foldLeft[Int](0)( (a,b) => a + isGC(b)).asInstanceOf[Double]/location.size()
entropy = calcEntropy(baseContent.map(b => ListBuffer(b))) + calcEntropy(baseContent.reverse.map(b => ListBuffer(b)))
val meta : String = metaData.reduceLeft(_ + "\t" + _)
return "%s\t%d\t%d\t%.2f\t%.2f\t%s".format(location.getContig,location.getStart,location.getStop,gcContent,entropy,meta)
}
def calcEntropy(byteList : ListBuffer[ListBuffer[Byte]]) : Double = {
if(byteList.size == 1) return 0
Math.log(1+byteList.tail.size-byteList.tail.dropWhile( u => u.equals(byteList(1))).size) +
calcEntropy(byteList.tail.foldLeft(ListBuffer(byteList(0)))( (a,b) => {
if ( b.equals(byteList(1)) ) {
a.dropRight(1) :+ (a.last ++ b)
} else {
a :+ b
}
}))
}
}

View File

@ -1,24 +0,0 @@
package org.broadinstitute.sting.scala
import org.broadinstitute.sting.gatk.walkers.LocusWalker
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker
import org.broadinstitute.sting.gatk.contexts.ReferenceContext
import org.broadinstitute.sting.gatk.contexts.AlignmentContext
class ScalaCountLoci extends LocusWalker[Int,Int] {
override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext): Int = {
return 1
}
def reduceInit(): Int = {
return 0;
}
def reduce(m: Int, r: Int): Int = {
return m + r;
}
def main(args: Array[String]) {
println("Hello, world!")
}
}