diff --git a/public/R/plot_Annotations_BinnedTruthMetrics.R b/public/R/plot_Annotations_BinnedTruthMetrics.R new file mode 100644 index 000000000..9f9ee290c --- /dev/null +++ b/public/R/plot_Annotations_BinnedTruthMetrics.R @@ -0,0 +1,190 @@ +#!/bin/env Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +input = args[1] +annotationName = args[2] +minBinCutoff = as.numeric(args[3]) +medianNumVariants = args[4] + +c <- read.table(input, header=T) + +all = c[c$numVariants>minBinCutoff & c$category=="all",] +novel = c[c$numVariants>minBinCutoff & c$category=="novel",] +dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",] +truth = c[c$numVariants>minBinCutoff & c$category=="truth",] + +# +# Calculate min, max, medians +# + +d = c[c$numVariants>minBinCutoff,] +ymin = min(d$titv) +ymax = max(d$titv) +xmin = min(d$value) +xmax = max(d$value) +m = weighted.mean(all$value,all$numVariants/sum(all$numVariants)) +ma = all[all$value > m,] +mb = all[all$value < m,] +m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants)) +m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants)) +if(medianNumVariants == "true") { +vc = cumsum( all$numVariants/sum(all$numVariants) ) +m10 = all$value[ max(which(vc<=0.10)) ] +m25 = all$value[ max(which(vc<=0.25)) ] +m = all$value[ max(which(vc<=0.5)) ] +m75 = all$value[ min(which(vc>=0.75)) ] +m90 = all$value[ min(which(vc>=0.90)) ] +} + +# +# Plot TiTv ratio as a function of the annotation +# + +outfile = paste(input, ".TiTv.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2,col="red") +abline(v=m75,lty=3) +abline(v=m25,lty=3) +text(m, ymin, "50", col="red", cex=0.6); +text(m75, ymin, "75", col="black", cex=0.6); +text(m25, ymin, "25", col="black", cex=0.6); +if(medianNumVariants == "true") { +abline(v=m90,lty=3) +abline(v=m10,lty=3) +text(m10, ymin, "10", col="black", cex=0.6); +text(m90, ymin, "90", col="black", cex=0.6); +} +points(novel$value,novel$titv,col="green",pch=20) +points(dbsnp$value,dbsnp$titv,col="blue",pch=20) +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(truth$value,truth$titv,col="magenta",pch=20) +legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20)) +} else { +legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) +} +dev.off() + +# +# Plot TiTv ratio as a function of the annotation, log scale on the x-axis +# + +outfile = paste(input, ".TiTv_log.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$titv,xlab=annotationName,log="x",ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2,col="red") +abline(v=m75,lty=3) +abline(v=m25,lty=3) +text(m, ymin, "50", col="red", cex=0.6); +text(m75, ymin, "75", col="black", cex=0.6); +text(m25, ymin, "25", col="black", cex=0.6); +if(medianNumVariants == "true") { +abline(v=m90,lty=3) +abline(v=m10,lty=3) +text(m10, ymin, "10", col="black", cex=0.6); +text(m90, ymin, "90", col="black", cex=0.6); +} +points(novel$value,novel$titv,col="green",pch=20) +points(dbsnp$value,dbsnp$titv,col="blue",pch=20) +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(truth$value,truth$titv,col="magenta",pch=20) +legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20)) +} else { +legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) +} +dev.off() + +# +# Plot dbsnp and true positive rate as a function of the annotation +# + +ymin = min(all$dbsnp) +ymax = max(all$dbsnp) +outfile = paste(input, ".truthRate.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +yLabel = "DBsnp Rate" +if( sum(all$truePositive==0) != length(all$truePositive) ) { +t = all[all$truePositive>0,] +yLabel = "DBsnp/True Positive Rate" +ymin = min(min(all$dbsnp),min(t$truePositive)) +ymax = max(max(all$dbsnp),max(t$truePositive)) +} +plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2,col="red") +abline(v=m75,lty=3) +abline(v=m25,lty=3) +text(m, ymin, "50", col="red", cex=0.6); +text(m75, ymin, "75", col="black", cex=0.6); +text(m25, ymin, "25", col="black", cex=0.6); +if(medianNumVariants == "true") { +abline(v=m90,lty=3) +abline(v=m10,lty=3) +text(m10, ymin, "10", col="black", cex=0.6); +text(m90, ymin, "90", col="black", cex=0.6); +} +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(t$value,t$truePositive,col="magenta",pch=20); +legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20)) +} +dev.off() + +# +# Plot dbsnp and true positive rate as a function of the annotation, log scale on the x-axis +# + +outfile = paste(input, ".truthRate_log.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +yLabel = "DBsnp Rate" +if( sum(all$truePositive==0) != length(all$truePositive) ) { +yLabel = "DBsnp/Truth Rate" +} +plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,ylim=c(ymin,ymax),pch=20,xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2,col="red") +abline(v=m75,lty=3) +abline(v=m25,lty=3) +text(m, ymin, "50", col="red", cex=0.6); +text(m75, ymin, "75", col="black", cex=0.6); +text(m25, ymin, "25", col="black", cex=0.6); +if(medianNumVariants == "true") { +abline(v=m90,lty=3) +abline(v=m10,lty=3) +text(m10, ymin, "10", col="black", cex=0.6); +text(m90, ymin, "90", col="black", cex=0.6); +} +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(t$value,t$truePositive,col="magenta",pch=20); +legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20)) +} +dev.off() + +# +# Plot histogram of the annotation's value +# + +outfile = paste(input, ".Histogram.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +dev.off() + +# +# Plot histogram of the annotation's value, log scale on x-axis +# + +outfile = paste(input, ".Histogram_log.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$numVariants,xlab=annotationName,log="x",ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +dev.off() diff --git a/public/R/plot_Tranches.R b/public/R/plot_Tranches.R new file mode 100755 index 000000000..a79ddd3ab --- /dev/null +++ b/public/R/plot_Tranches.R @@ -0,0 +1,87 @@ +#!/bin/env Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +tranchesFile = args[1] +targetTITV = as.numeric(args[2]) +targetSensitivity = as.numeric(args[3]) +suppressLegend = ! is.na(args[4]) + +# ----------------------------------------------------------------------------------------------- +# Useful general routines +# ----------------------------------------------------------------------------------------------- + +MIN_FP_RATE = 0.001 # 1 / 1000 is min error rate + +titvFPEst <- function(titvExpected, titvObserved) { + max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), MIN_FP_RATE) +} + +titvFPEstV <- function(titvExpected, titvs) { + sapply(titvs, function(x) titvFPEst(titvExpected, x)) +} + +nTPFP <- function(nVariants, FDR) { + return(list(TP = nVariants * (1 - FDR/100), FP = nVariants * (FDR / 100))) +} + +leftShift <- function(x, leftValue = 0) { + r = rep(leftValue, length(x)) + for ( i in 1:(length(x)-1) ) { + #print(list(i=i)) + r[i] = x[i+1] + } + r +} + +# ----------------------------------------------------------------------------------------------- +# Tranches plot +# ----------------------------------------------------------------------------------------------- +data2 = read.table(tranchesFile,sep=",",head=T) +data2 = data2[order(data2$novelTiTv, decreasing=F),] +#data2 = data2[order(data2$FDRtranche, decreasing=T),] +cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange") +density=c(20, -1, -1, 20) +outfile = paste(tranchesFile, ".pdf", sep="") +pdf(outfile, height=5, width=8) +par(mar = c(5, 5, 4, 2) + 0.1) +novelTiTv = c(data2$novelTITV,data2$novelTiTv) +alpha = 1 - titvFPEstV(targetTITV, novelTiTv) +#print(alpha) + +numGood = round(alpha * data2$numNovel); + +#numGood = round(data2$numNovel * (1-data2$targetTruthSensitivity/100)) +numBad = data2$numNovel - numGood; + +numPrevGood = leftShift(numGood, 0) +numNewGood = numGood - numPrevGood +numPrevBad = leftShift(numBad, 0) +numNewBad = numBad - numPrevBad + +d=matrix(c(numPrevGood,numNewGood, numNewBad, numPrevBad),4,byrow=TRUE) +#print(d) +barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (1000s)", density=density, cex.axis=1.25, cex.lab=1.25) # , xlim=c(250000,350000)) +#abline(v= d[2,dim(d)[2]], lty=2) +#abline(v= d[1,3], lty=2) +if ( ! suppressLegend ) + legend(3, length(data2$targetTruthSensitivity)/3 +1, c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25) + +mtext("Ti/Tv",2,line=2.25,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1) +mtext("truth",2,line=0,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1) +axis(2,line=-1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=data2$targetTruthSensitivity, las=1, cex.axis=1.0) +axis(2,line=1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0) + +# plot sensitivity vs. specificity +sensitivity = data2$truthSensitivity +if ( ! is.null(sensitivity) ) { + #specificity = titvFPEstV(targetTITV, novelTiTv) + specificity = novelTiTv + plot(sensitivity, specificity, type="b", col="cornflowerblue", xlab="Tranche truth sensitivity", ylab="Specificity (Novel Ti/Tv ratio)") + abline(h=targetTITV, lty=2) + abline(v=targetSensitivity, lty=2) + #text(max(sensitivity), targetTITV-0.05, labels="Expected novel Ti/Tv", pos=2) +} + +dev.off() diff --git a/public/R/plot_residualError_OtherCovariate.R b/public/R/plot_residualError_OtherCovariate.R new file mode 100644 index 000000000..a1385ff3f --- /dev/null +++ b/public/R/plot_residualError_OtherCovariate.R @@ -0,0 +1,108 @@ +#!/bin/env Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +input = args[1] +covariateName = args[2] + +outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +c <- read.table(input, header=T) +c <- c[sort.list(c[,1]),] + +# +# Plot residual error as a function of the covariate +# + +d.good <- c[c$nBases >= 1000,] +d.1000 <- c[c$nBases < 1000,] +rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh +rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) ) +theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3)) +if( length(d.good$nBases) == length(c$nBases) ) { + theTitle = paste("RMSE =", round(rmseAll,digits=3)) +} +# Don't let residual error go off the edge of the plot +d.good$residualError = d.good$Qempirical-d.good$Qreported +d.good$residualError[which(d.good$residualError > 10)] = 10 +d.good$residualError[which(d.good$residualError < -10)] = -10 +d.1000$residualError = d.1000$Qempirical-d.1000$Qreported +d.1000$residualError[which(d.1000$residualError > 10)] = 10 +d.1000$residualError[which(d.1000$residualError < -10)] = -10 +c$residualError = c$Qempirical-c$Qreported +c$residualError[which(c$residualError > 10)] = 10 +c$residualError[which(c$residualError < -10)] = -10 +pointType = "p" +if( length(c$Covariate) <= 20 ) { + pointType = "o" +} +if( is.numeric(c$Covariate) ) { + plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate))) + points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20) +} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice + plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10)) + points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue") +} +dev.off() + + +# +# Plot mean quality versus the covariate +# + +outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +pointType = "p" +if( length(c$Covariate) <= 20 ) { + pointType = "o" +} +theTitle = paste("Quality By", covariateName); +if( is.numeric(c$Covariate) ) { + plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate))) + points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20) +} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice + plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40)) + points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue") +} +dev.off() + +# +# Plot histogram of the covariate +# + +e = d.good +f = d.1000 +outfile = paste(input, ".", covariateName,"_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0) +hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0) + +lwdSize=2 +if( length(c$Covariate) <= 20 ) { + lwdSize=7 +} else if( length(c$Covariate) <= 70 ) { + lwdSize=4 +} + +if( is.numeric(c$Covariate) ) { + if( length(hst$e.Covariate) == 0 ) { + plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate))) + } else { + plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate))) + points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue") + } + axis(2,axTicks(2), format(axTicks(2), scientific=F)) +} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice + hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0) + plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n") + if( length(hst$c.Covariate) > 9 ) { + axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)]) + } else { + axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate) + } + axis(2,axTicks(2), format(axTicks(2), scientific=F)) +} +dev.off() diff --git a/public/R/plot_residualError_QualityScoreCovariate.R b/public/R/plot_residualError_QualityScoreCovariate.R new file mode 100644 index 000000000..81bc9460d --- /dev/null +++ b/public/R/plot_residualError_QualityScoreCovariate.R @@ -0,0 +1,70 @@ +#!/bin/env Rscript + +args <- commandArgs(TRUE) + +input = args[1] +Qcutoff = as.numeric(args[2]) +maxQ = as.numeric(args[3]) +maxHist = as.numeric(args[4]) + +t=read.table(input, header=T) + +# +# Plot of reported quality versus empirical quality +# + +outfile = paste(input, ".quality_emp_v_stated.pdf", sep="") +pdf(outfile, height=7, width=7) +d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,] +d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,] +d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,] +f <- t[t$Qreported < Qcutoff,] +e <- rbind(d.good, d.1000, d.10000) +rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh +rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) ) +theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3)) +if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) { + theTitle = paste("RMSE =", round(rmseAll,digits=3)); +} +plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score") +points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16) +points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16) +points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16) +abline(0,1, lty=2) +dev.off() + +# +# Plot Q empirical histogram +# + +outfile = paste(input, ".quality_emp_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0) +hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0) +percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases)) +entropy = -sum(log2(percentBases)*percentBases) +yMax = max(hst$e.nBases) +if(maxHist != 0) { +yMax = maxHist +} +plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n") +points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1") +axis(2,axTicks(2), format(axTicks(2), scientific=F)) +dev.off() + +# +# Plot Q reported histogram +# + +outfile = paste(input, ".quality_rep_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0) +hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0) +yMax = max(hst$e.nBases) +if(maxHist != 0) { +yMax = maxHist +} +plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n") +points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1") +axis(2,axTicks(2), format(axTicks(2), scientific=F)) +dev.off() diff --git a/public/R/titvFPEst.R b/public/R/titvFPEst.R new file mode 100755 index 000000000..7af5e8bbb --- /dev/null +++ b/public/R/titvFPEst.R @@ -0,0 +1,138 @@ +titvFPEst <- function(titvExpected, titvObserved) { max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), 0.001) } + +titvFPEstV <- function(titvExpected, titvs) { + sapply(titvs, function(x) titvFPEst(titvExpected, x)) +} + +calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) { + TP <- nknown + (1-titvFPEst(knownTiTv, novelTiTv)) * nnovel + 2 * TP / 3 / callable +} + +marginalTiTv <- function( nx, titvx, ny, titvy ) { + tvx = nx / (titvx + 1) + tix = nx - tvx + tvy = ny / (titvy + 1) + tiy = ny - tvy + tiz = tix - tiy + tvz = tvx - tvy + return(tiz / tvz) +} +marginaldbSNPRate <- function( nx, dbx, ny, dby ) { + knownx = nx * dbx / 100 + novelx = nx - knownx + knowny = ny * dby / 100 + novely = ny - knowny + knownz = knownx - knowny + novelz = novelx - novely + return(knownz / ( knownz + novelz ) * 100) +} + +numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) { + nCalls <- L * theta * calledFractionOfRegion * sum(1 / seq(1, 2 * nIndividuals)) + return(list(nCalls = nCalls, nKnown = dbSNPRate * nCalls, nNovel = (1-dbSNPRate) * nCalls)) +} + +normalize <- function(x) { + x / sum(x) +} + +normcumsum <- function(x) { + cumsum(normalize(x)) +} + +cumhist <- function(d, ...) { + plot(d[order(d)], type="b", col="orange", lwd=2, ...) +} + +revcumsum <- function(x) { + return(rev(cumsum(rev(x)))) +} + +phred <- function(x) { + log10(max(x,10^(-9.9)))*-10 +} + +pOfB <- function(b, B, Q) { + #print(paste(b, B, Q)) + p = 1 - 10^(-Q/10) + if ( b == B ) + return(p) + else + return(1 - p) +} + +pOfG <- function(bs, qs, G) { + a1 = G[1] + a2 = G[2] + + log10p = 0 + for ( i in 1:length(bs) ) { + b = bs[i] + q = qs[i] + p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2 + log10p = log10p + log10(p1) + } + + return(log10p) +} + +pOfGs <- function(nAs, nBs, Q) { + bs = c(rep("a", nAs), rep("t", nBs)) + qs = rep(Q, nAs + nBs) + G1 = c("a", "a") + G2 = c("a", "t") + G3 = c("t", "t") + + log10p1 = pOfG(bs, qs, G1) + log10p2 = pOfG(bs, qs, G2) + log10p3 = pOfG(bs, qs, G3) + Qsample = phred(1 - 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3)))) + + return(list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample)) +} + +QsampleExpected <- function(depth, Q) { + weightedAvg = 0 + for ( d in 1:(depth*3) ) { + Qsample = 0 + pOfD = dpois(d, depth) + for ( nBs in 0:d ) { + pOfnB = dbinom(nBs, d, 0.5) + nAs = d - nBs + Qsample = pOfGs(nAs, nBs, Q)$Qsample + #Qsample = 1 + weightedAvg = weightedAvg + Qsample * pOfD * pOfnB + print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg))) + } + } + + return(weightedAvg) +} + +plotQsamples <- function(depths, Qs, Qmax) { + cols = rainbow(length(Qs)) + plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling") + + for ( i in 1:length(Qs) ) { + Q = Qs[i] + y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q))) + points(depths, y, col=cols[i], type="b") + } + + legend("topleft", paste("Q", Qs), fill=cols) +} + +pCallHetGivenDepth <- function(depth, nallelesToCall) { + depths = 0:(2*depth) + pNoAllelesToCall = apply(as.matrix(depths),1,function(d) sum(dbinom(0:nallelesToCall,d,0.5))) + dpois(depths,depth)*(1-pNoAllelesToCall) +} + +pCallHets <- function(depth, nallelesToCall) { + sum(pCallHetGivenDepth(depth,nallelesToCall)) +} + +pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) { + 1-(1-pCallHets(depth,nallelesToCall))^nsamples +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 28b88f9f2..a53665d64 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -798,11 +798,11 @@ public class IndelRealigner extends ReadWalker { private void generateAlternateConsensesFromKnownIndels(final Set altConsensesToPopulate, final int leftmostIndex, final byte[] reference) { for ( VariantContext knownIndel : knownIndelsToTry ) { - if ( knownIndel == null || !knownIndel.isIndel() ) + if ( knownIndel == null || !knownIndel.isIndel() || knownIndel.isComplexIndel() ) continue; byte[] indelStr = knownIndel.isInsertion() ? knownIndel.getAlternateAllele(0).getBases() : Utils.dupBytes((byte)'-', knownIndel.getReference().length()); int start = knownIndel.getStart() - leftmostIndex + 1; - Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel.isDeletion()); + Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel); if ( c != null ) altConsensesToPopulate.add(c); } @@ -988,7 +988,7 @@ public class IndelRealigner extends ReadWalker { } // create a Consensus from just the indel string that falls on the reference - private Consensus createAlternateConsensus(final int indexOnRef, final byte[] reference, final byte[] indelStr, final boolean isDeletion) { + private Consensus createAlternateConsensus(final int indexOnRef, final byte[] reference, final byte[] indelStr, final VariantContext indel) { if ( indexOnRef < 0 || indexOnRef >= reference.length ) return null; @@ -1002,14 +1002,16 @@ public class IndelRealigner extends ReadWalker { if ( indexOnRef > 0 ) cigar.add(new CigarElement(indexOnRef, CigarOperator.M)); - if ( isDeletion ) { + if ( indel.isDeletion() ) { refIdx += indelStr.length; cigar.add(new CigarElement(indelStr.length, CigarOperator.D)); } - else { + else if ( indel.isInsertion() ) { for ( byte b : indelStr ) sb.append((char)b); cigar.add(new CigarElement(indelStr.length, CigarOperator.I)); + } else { + throw new IllegalStateException("Creating an alternate consensus from a complex indel is not allows"); } if ( reference.length - refIdx > 0 ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java new file mode 100755 index 000000000..fc196e712 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.indels; + +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.filters.BadMateFilter; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.commandline.Argument; + +import java.io.File; +import java.util.*; + +@By(DataSource.READS) +// walker to count realigned reads +public class RealignedReadCounter extends ReadWalker { + + public static final String ORIGINAL_CIGAR_TAG = "OC"; + public static final String ORIGINAL_POSITION_TAG = "OP"; + + @Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) + protected String intervalsFile = null; + + // the intervals input by the user + private Iterator intervals = null; + + // the current interval in the list + private GenomeLoc currentInterval = null; + + private long updatedIntervals = 0, updatedReads = 0, affectedBases = 0; + private boolean intervalWasUpdated = false; + + public void initialize() { + // prepare to read intervals one-by-one, as needed (assuming they are sorted). + intervals = new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new File(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY ); + currentInterval = intervals.hasNext() ? intervals.next() : null; + } + + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + if ( currentInterval == null ) { + return 0; + } + + GenomeLoc readLoc = ref.getGenomeLocParser().createGenomeLoc(read); + // hack to get around unmapped reads having screwy locations + if ( readLoc.getStop() == 0 ) + readLoc = ref.getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart()); + + if ( readLoc.isBefore(currentInterval) || ReadUtils.is454Read(read) ) + return 0; + + if ( readLoc.overlapsP(currentInterval) ) { + if ( doNotTryToClean(read) ) + return 0; + + if ( read.getAttribute(ORIGINAL_CIGAR_TAG) != null ) { + String newCigar = (String)read.getAttribute(ORIGINAL_CIGAR_TAG); + // deal with an old bug + if ( read.getCigar().toString().equals(newCigar) ) { + //System.out.println(currentInterval + ": " + read.getReadName() + " " + read.getCigarString() + " " + newCigar); + return 0; + } + + if ( !intervalWasUpdated ) { + intervalWasUpdated = true; + updatedIntervals++; + affectedBases += 20 + getIndelSize(read); + } + updatedReads++; + + } + } else { + do { + intervalWasUpdated = false; + currentInterval = intervals.hasNext() ? intervals.next() : null; + } while ( currentInterval != null && currentInterval.isBefore(readLoc) ); + } + + return 0; + } + + private int getIndelSize(SAMRecord read) { + for ( CigarElement ce : read.getCigar().getCigarElements() ) { + if ( ce.getOperator() == CigarOperator.I ) + return 0; + if ( ce.getOperator() == CigarOperator.D ) + return ce.getLength(); + } + logger.warn("We didn't see an indel for this read: " + read.getReadName() + " " + read.getAlignmentStart() + " " + read.getCigar()); + return 0; + } + + private boolean doNotTryToClean(SAMRecord read) { + return read.getReadUnmappedFlag() || + read.getNotPrimaryAlignmentFlag() || + read.getReadFailsVendorQualityCheckFlag() || + read.getMappingQuality() == 0 || + read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || + (BadMateFilter.hasBadMate(read)); + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + System.out.println(updatedIntervals + " intervals were updated"); + System.out.println(updatedReads + " reads were updated"); + System.out.println(affectedBases + " bases were affected"); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java index 7989f52db..81d9b4ddb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java @@ -161,7 +161,7 @@ public class AnnotateMNPsWalker extends RodWalker { boolean atStartOfVc = curLocus.getStart() == vcLoc.getStart(); boolean atEndOfVc = curLocus.getStart() == vcLoc.getStop(); - if (vc.getType() == VariantContext.Type.MNP) { + if (vc.isMNP()) { logger.debug("Observed MNP at " + vcLoc); if (isChrM(vc)) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java new file mode 100755 index 000000000..feb5f62af --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java @@ -0,0 +1,62 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.RefWalker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.List; +import java.io.PrintStream; + +/** + * Counts the number of contiguous regions the walker traverses over. Slower than it needs to be, but + * very useful since overlapping intervals get merged, so you can count the number of intervals the GATK merges down to. + * This was its very first use. + */ +public class CountIntervals extends RefWalker { + @Output + PrintStream out; + + @Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false) + int numOverlaps = 2; + + public Long reduceInit() { + return 0l; + } + + public boolean isReduceByInterval() { return true; } + + public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) { + return null; + } + + List checkIntervals = tracker.getGATKFeatureMetaData("check",false); + return (long) checkIntervals.size(); + } + + public Long reduce(Long loc, Long prev) { + if ( loc == null ) { + return 0l; + } else { + return Math.max(prev,loc); + } + } + + public void onTraversalDone(List> finalReduce) { + long count = 0; + for ( Pair g : finalReduce ) { + if ( g.second >= numOverlaps) { + count ++; + } + } + out.printf("Number of contiguous intervals: %d",count); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java deleted file mode 100755 index 67879a442..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; -import org.broad.tribble.readers.AsciiLineReader; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.SimpleTimer; - -import java.io.PrintStream; -import java.io.File; -import java.io.FileInputStream; -import java.util.*; - -/** - * Emits specific fields as dictated by the user from one or more VCF files. - */ -@Requires(value={}) -public class ProfileRodSystem extends RodWalker { - @Output(doc="File to which results should be written",required=true) - protected PrintStream out; - - @Argument(fullName="nIterations", shortName="N", doc="Number of raw reading iterations to perform", required=false) - int nIterations = 1; - - @Argument(fullName="maxRecords", shortName="M", doc="Max. number of records to process", required=false) - int MAX_RECORDS = -1; - - SimpleTimer timer = new SimpleTimer("myTimer"); - - public void initialize() { - File rodFile = getRodFile(); - - out.printf("# walltime is in seconds%n"); - out.printf("# file is %s%n", rodFile); - out.printf("# file size is %d bytes%n", rodFile.length()); - out.printf("operation\titeration\twalltime%n"); - for ( int i = 0; i < nIterations; i++ ) { - out.printf("read.bytes\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_BYTE)); - out.printf("read.line\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_LINE)); - out.printf("line.and.parts\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_PARTS)); - out.printf("decode.loc\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE_LOC)); - out.printf("full.decode\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE)); - } - - timer.start(); // start up timer for map itself - } - - private enum ReadMode { BY_BYTE, BY_LINE, BY_PARTS, DECODE_LOC, DECODE }; - - private final double readFile(File f, ReadMode mode) { - timer.start(); - - try { - byte[] data = new byte[100000]; - FileInputStream s = new FileInputStream(f); - - if ( mode == ReadMode.BY_BYTE ) { - while (true) { - if ( s.read(data) == -1 ) - break; - } - } else { - int counter = 0; - VCFCodec codec = new VCFCodec(); - String[] parts = new String[100000]; - AsciiLineReader lineReader = new AsciiLineReader(s); - - if ( mode == ReadMode.DECODE_LOC || mode == ReadMode.DECODE ) - codec.readHeader(lineReader); - - while (counter++ < MAX_RECORDS || MAX_RECORDS == -1) { - String line = lineReader.readLine(); - if ( line == null ) - break; - else if ( mode == ReadMode.BY_PARTS ) { - ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR); - } - else if ( mode == ReadMode.DECODE_LOC ) { - codec.decodeLoc(line); - } - else if ( mode == ReadMode.DECODE ) { - processOneVC((VariantContext)codec.decode(line)); - } - } - } - } catch ( Exception e ) { - throw new RuntimeException(e); - } - - return timer.getElapsedTime(); - } - - private File getRodFile() { - List rods = this.getToolkit().getRodDataSources(); - ReferenceOrderedDataSource rod = rods.get(0); - return rod.getFile(); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) // RodWalkers can make funky map calls - return 0; - - VariantContext vc = tracker.getVariantContext(ref, "rod", context.getLocation()); - processOneVC(vc); - - return 0; - } - - private static final void processOneVC(VariantContext vc) { - vc.getNSamples(); // force us to parse the samples - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - - public void onTraversalDone(Integer sum) { - out.printf("gatk.traversal\t%d\t%.2f%n", 0, timer.getElapsedTime()); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java new file mode 100644 index 000000000..9cb715507 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java @@ -0,0 +1,153 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.*; +import java.math.BigInteger; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.Collection; +import java.util.List; + +/** + * a walker for validating (in the style of validating pile-up) the ROD system. + */ +@Reference(window=@Window(start=-40,stop=40)) +public class RodSystemValidationWalker extends RodWalker { + + // the divider to use in some of the text output + private static final String DIVIDER = ","; + + @Output + public PrintStream out; + + @Argument(fullName="PerLocusEqual",required=false,doc="Should we check that all records at the same site produce equivilent variant contexts") + public boolean allRecordsVariantContextEquivalent = false; + + // used to calculate the MD5 of a file + MessageDigest digest = null; + + // we sometimes need to know what rods the engine's seen + List rodList; + + /** + * emit the md5 sums for each of the input ROD files (will save up a lot of time if and when the ROD files change + * underneath us). + */ + public void initialize() { + // setup the MD5-er + try { + digest = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) { + throw new ReviewedStingException("Unable to find MD5 checksumer"); + } + out.println("Header:"); + // enumerate the list of ROD's we've loaded + rodList = this.getToolkit().getRodDataSources(); + for (ReferenceOrderedDataSource rod : rodList) { + out.println(rod.getName() + DIVIDER + rod.getType()); + out.println(rod.getName() + DIVIDER + rod.getFile()); + out.println(rod.getName() + DIVIDER + md5sum(rod.getFile())); + } + out.println("Data:"); + } + + /** + * + * @param tracker the ref meta data tracker to get RODs + * @param ref reference context + * @param context the reads + * @return an 1 for each site with a rod(s), 0 otherwise + */ + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + int ret = 0; + if (tracker != null && tracker.getAllRods().size() > 0) { + out.print(context.getLocation() + DIVIDER); + Collection features = tracker.getAllRods(); + for (GATKFeature feat : features) + out.print(feat.getName() + DIVIDER); + out.println(";"); + ret++; + } + + // if the argument was set, check for equivalence + if (allRecordsVariantContextEquivalent && tracker != null) { + Collection col = tracker.getAllVariantContexts(ref); + VariantContext con = null; + for (VariantContext contextInList : col) + if (con == null) con = contextInList; + else if (!con.equals(col)) out.println("FAIL: context " + col + " doesn't match " + con); + } + return ret; + } + + /** + * Provide an initial value for reduce computations. + * + * @return Initial value of reduce. + */ + @Override + public Integer reduceInit() { + return 0; + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + @Override + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } + + @Override + public void onTraversalDone(Integer result) { + // Double check traversal result to make count is the same. + // TODO: Is this check necessary? + out.println("[REDUCE RESULT] Traversal result is: " + result); + } + + // shamelessly absconded and adapted from http://www.javalobby.org/java/forums/t84420.html + private String md5sum(File f) { + InputStream is; + try { + is = new FileInputStream(f); + } catch (FileNotFoundException e) { + return "Not a file"; + } + byte[] buffer = new byte[8192]; + int read = 0; + try { + while ((read = is.read(buffer)) > 0) { + digest.update(buffer, 0, read); + } + byte[] md5sum = digest.digest(); + BigInteger bigInt = new BigInteger(1, md5sum); + return bigInt.toString(16); + } + catch (IOException e) { + throw new RuntimeException("Unable to process file for MD5", e); + } + finally { + try { + is.close(); + } + catch (IOException e) { + throw new RuntimeException("Unable to close input stream for MD5 calculation", e); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java deleted file mode 100755 index eda01bdb4..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java +++ /dev/null @@ -1,298 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.qc; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.SimpleTimer; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.Argument; - -import java.io.PrintStream; - -/** - * Walks over the input data set, calculating the number of reads seen for diagnostic purposes. - * Can also count the number of reads matching a given criterion using read filters (see the - * --read-filter command line argument). Simplest example of a read-backed analysis. - */ -@BAQMode(QualityMode = BAQ.QualityMode.DONT_MODIFY, ApplicationTime = BAQ.ApplicationTime.HANDLED_IN_WALKER) -@Reference(window=@Window(start=-5,stop=5)) -@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES}) -public class ValidateBAQWalker extends ReadWalker { - @Output(doc="File to which results should be written",required=true) - protected PrintStream out; - - @Argument(doc="maximum read length to apply the BAQ calculation too",required=false) - protected int maxReadLen = 1000; - - @Argument(doc="",required=false) - protected int bw = 7; - - @Argument(doc="",required=false) - protected boolean samtoolsMode = false; - - @Argument(doc="only operates on reads with this name",required=false) - protected String readName = null; - - @Argument(doc="If true, all differences are errors", required=false) - protected boolean strict = false; - - @Argument(doc="prints info for each read", required=false) - protected boolean printEachRead = false; - - @Argument(doc="Also prints out detailed comparison information when for known calculation differences", required=false) - protected boolean alsoPrintWarnings = false; - - @Argument(doc="Include reads without BAQ tag", required=false) - protected boolean includeReadsWithoutBAQTag = false; - - @Argument(doc="x each read is processed", required=false) - protected int magnification = 1; - - @Argument(doc="Profile performance", required=false) - protected boolean profile = false; - - int counter = 0; - - BAQ baqHMM = null; // matches current samtools parameters - - public void initialize() { - if ( samtoolsMode ) - baqHMM = new BAQ(1e-3, 0.1, bw, (byte)0, true); - else - baqHMM = new BAQ(); - } - - long goodReads = 0, badReads = 0; - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { - - if ( (readName == null || readName.equals(read.getReadName())) && read.getReadLength() <= maxReadLen && (includeReadsWithoutBAQTag || BAQ.hasBAQTag(read) ) ) { - if ( baqHMM.excludeReadFromBAQ(read) ) - return 0; - - if ( profile ) { - profileBAQ(ref, read); - } else { - validateBAQ(ref, read); - } - - return 1; - } - - return 0; - } - - SimpleTimer tagTimer = new SimpleTimer("from.tag"); - SimpleTimer baqReadTimer = new SimpleTimer("baq.read"); - SimpleTimer glocalTimer = new SimpleTimer("hmm.glocal"); - - private void profileBAQ(ReferenceContext ref, SAMRecord read) { - IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference(); - BAQ.BAQCalculationResult baq = null; - - tagTimer.restart(); - for ( int i = 0; i < magnification; i++ ) { BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag); } - tagTimer.stop(); - - baqReadTimer.restart(); - for ( int i = 0; i < magnification; i++ ) { baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY ); } - baqReadTimer.stop(); - - glocalTimer.restart(); - for ( int i = 0; i < magnification; i++ ) - baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY); - glocalTimer.stop(); - } - - - private void validateBAQ(ReferenceContext ref, SAMRecord read) { - IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference(); - byte[] baqFromTag = BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag); - if (counter++ % 1000 == 0 || printEachRead) out.printf("Checking read %s (%d)%n", read.getReadName(), counter); - - BAQ.BAQCalculationResult baq = baqHMM.calcBAQFromHMM(read, refReader); - - boolean fail = false; - boolean print = false; - int badi = 0; - - if ( BAQ.hasBAQTag(read) ) { - for ( badi = 0; badi < baqFromTag.length; badi++ ) { - if ( baqFromTag[badi] != baq.bq[badi] ) { - if ( cigarLength(read) != read.getReadLength() ) { - print = true; - fail = false; - out.printf(" different, but cigar length != read length%n"); - break; - } - if (MathUtils.arrayMin(read.getBaseQualities()) == 0) { - print = true; - fail = strict; - out.printf(" different, but Q0 base detected%n"); - break; - } - else if (readHasSoftClip(read) && ! samtoolsMode) { - print = true; - fail = strict; - out.printf(" different, but soft clip detected%n"); - break; - } else if (readHasDeletion(read) ) { // && ! samtoolsMode) { - print = true; - fail = strict; - out.printf(" different, but deletion detected%n"); - break; - } else if ( baq.bq[badi] < baqHMM.getMinBaseQual() ) { - print = fail = true; - out.printf(" Base quality %d < min %d", baq.bq[badi], baqHMM.getMinBaseQual()); - break; - } else { - print = fail = true; - break; - } - } - } - if ( fail || print ) - badReads++; - else - goodReads++; - } - - if ( fail || printEachRead || ( print && alsoPrintWarnings ) ) { - byte[] pos = new byte[baq.bq.length]; - for ( int i = 0; i < pos.length; i++ ) pos[i] = (byte)i; - - out.printf(" read length : %d%n", read.getReadLength()); - out.printf(" read start : %d (%d unclipped)%n", read.getAlignmentStart(), read.getUnclippedStart()); - out.printf(" cigar : %s%n", read.getCigarString()); - out.printf(" ref bases : %s%n", new String(baq.refBases)); - out.printf(" read bases : %s%n", new String(read.getReadBases())); - out.printf(" ref length : %d%n", baq.refBases.length); - out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG)); - if ( BAQ.hasBAQTag(read) ) printQuals(" BQ deltas : ", getBAQDeltas(read), true); - printQuals(" original quals: ", read.getBaseQualities(), true); - printQuals(" baq quals: ", baq.bq, true); - printQuals(" positions : ", pos, true); - printQuals(" original quals: ", read.getBaseQualities()); - if ( BAQ.hasBAQTag(read) ) printQuals(" tag quals: ", baqFromTag); - printQuals(" hmm quals: ", baq.bq); - out.printf(" read bases : %s%n", new String(read.getReadBases())); - out.println(Utils.dupString('-', 80)); - } - - - if ( fail ) - throw new StingException(String.format("BAQ from read and from HMM differ in read %s at position %d: tag qual = %d, hmm qual = %d", - read.getReadName(), badi, baqFromTag[badi], baq.bq[badi])); - } - - private final static boolean readHasSoftClip(SAMRecord read) { - for (CigarElement e : read.getCigar().getCigarElements()) { - if ( e.getOperator() == CigarOperator.SOFT_CLIP ) - return true; - } - - return false; - } - - private final static boolean readHasDeletion(SAMRecord read) { - for (CigarElement e : read.getCigar().getCigarElements()) { - if ( e.getOperator() == CigarOperator.DELETION ) - return true; - } - - return false; - } - - public final void printQuals( String prefix, byte[] quals ) { - printQuals(prefix, quals, false); - } - - public final void printQuals( String prefix, byte[] quals, boolean asInt ) { - printQuals(out, prefix, quals, asInt); - } - - public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) { - out.print(prefix); - for ( int i = 0; i < quals.length; i++) { - if ( asInt ) { - out.printf("%2d", (int)quals[i]); - if ( i+1 != quals.length ) out.print(","); - } else - out.print((char)(quals[i]+33)); - } - out.println(); - } - - /** - * Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present. - * @param read - * @return - */ - public static byte[] getBAQDeltas(SAMRecord read) { - byte[] baq = BAQ.getBAQTag(read); - if ( baq != null ) { - byte[] deltas = new byte[baq.length]; - for ( int i = 0; i < deltas.length; i++) - deltas[i] = (byte)(-1 * (baq[i] - 64)); - return deltas; - } else - return null; - } - - private int cigarLength(SAMRecord read) { - int readI = 0; - for ( CigarElement elt : read.getCigar().getCigarElements() ) { - int l = elt.getLength(); - switch (elt.getOperator()) { - case N: // cannot handle these - return 0; - case H : case P : // ignore pads and hard clips - break; - case S : - case I : - readI += l; - break; - case D : break; - case M : - readI += l; - break; - default: - throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName()); - } - } - return readI; - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - public void onTraversalDone(Integer nreads) { - if ( profile ) { - out.printf("n.reads baq.per.read calculation time.in.secs%n"); - printTimer(nreads, tagTimer); - printTimer(nreads, glocalTimer); - printTimer(nreads, baqReadTimer); - } else { - out.printf("total reads BAQ'd %d; concordant BAQ reads %d %.4f; discordant BAQ reads %d %.4f%n", nreads, - goodReads, (100.0 * goodReads) / nreads, - badReads, (100.0 * badReads) / nreads); - } - } - - private final void printTimer(int nreads, SimpleTimer timer) { - out.printf("%d %d %s %.2f%n", nreads, magnification, timer.getName(), timer.getElapsedTime()); - } -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java index f350e7531..fde233b5d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java @@ -209,7 +209,7 @@ public class PickSequenomProbes extends RodWalker { String assay_sequence; if ( vc.isSNP() ) assay_sequence = leading_bases + "[" + (char)ref.getBase() + "/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases; - else if ( vc.getType() == VariantContext.Type.MNP ) + else if ( vc.isMNP() ) assay_sequence = leading_bases + "[" + new String(vc.getReference().getBases()) + "/" + new String(vc.getAlternateAllele(0).getBases())+"]"+trailing_bases.substring(vc.getReference().length()-1); else if ( vc.isInsertion() ) assay_sequence = leading_bases + (char)ref.getBase() + "[-/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java index ec7395f85..1bd73414c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java @@ -113,7 +113,7 @@ public class ValidateVariants extends RodWalker { observedRefAllele = Allele.create(Allele.NULL_ALLELE_STRING); } // deletions - else if ( vc.isDeletion() || vc.isMixed() || vc.getType() == VariantContext.Type.MNP ) { + else if ( vc.isDeletion() || vc.isMixed() || vc.isMNP() ) { // we can't validate arbitrarily long deletions if ( reportedRefAllele.length() > 100 ) { logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", reportedRefAllele.length(), vc.getChr(), vc.getStart())); @@ -123,7 +123,7 @@ public class ValidateVariants extends RodWalker { // deletions are associated with the (position of) the last (preceding) non-deleted base; // hence to get actually deleted bases we need offset = 1 int offset = 1 ; - if ( vc.getType() == VariantContext.Type.MNP ) offset = 0; // if it's an MNP, the reported position IS the first modified base + if ( vc.isMNP() ) offset = 0; // if it's an MNP, the reported position IS the first modified base byte[] refBytes = ref.getBases(); byte[] trueRef = new byte[reportedRefAllele.length()]; for (int i = 0; i < reportedRefAllele.length(); i++) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 87466e21b..3d375aba2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -414,7 +414,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @return vc subcontext */ public VariantContext subContextFromGenotypes(Collection genotypes, Set alleles) { - return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), getFilters(), getAttributes()); + return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), filtersWereApplied() ? getFilters() : null, getAttributes()); } @@ -566,10 +566,21 @@ public class VariantContext implements Feature { // to enable tribble intergrati return getType() == Type.INDEL && getAlternateAllele(0).isNull(); } + /** + * @return true if the alleles indicate neither a simple deletion nor a simple insertion + */ + public boolean isComplexIndel() { + return isIndel() && !isDeletion() && !isInsertion(); + } + public boolean isSymbolic() { return getType() == Type.SYMBOLIC; } + public boolean isMNP() { + return getType() == Type.MNP; + } + /** * convenience method for indels * diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 07f46c1c3..61bb8b34b 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -83,7 +83,7 @@ public abstract class BaseTest { */ public static final String MD5_FILE_DB_SUBDIR = "integrationtests"; - public static final String testDir = "testdata/"; + public static final String testDir = "public/testdata/"; /** before the class starts up */ static { diff --git a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java index a17c162f5..1e4625bf0 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java @@ -60,8 +60,8 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); Collection samFiles = new ArrayList(); - samFiles.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); testEngine.setSAMFileIDs(samFiles); testEngine.checkForDuplicateSamFiles(); @@ -72,10 +72,10 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); Collection samFiles = new ArrayList(); - samFiles.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File("testdata/exampleNORG.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File("testdata/exampleNORG.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags())); testEngine.setSAMFileIDs(samFiles); testEngine.checkForDuplicateSamFiles(); @@ -97,7 +97,7 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { GATKArgumentCollection argCollection = new GATKArgumentCollection(); testEngine.setArguments(argCollection); - File fastaFile = new File("testdata/exampleFASTA.fasta"); + File fastaFile = new File("public/testdata/exampleFASTA.fasta"); GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile)); testEngine.setGenomeLocParser(genomeLocParser); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java index acfcbab7b..4b225aaea 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java @@ -19,7 +19,7 @@ public class RealignerTargetCreatorIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( "-T RealignerTargetCreator -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", 1, - Arrays.asList("fd2d9dbf718f7a6d82ae1787ac1fe61e")); + Arrays.asList("f23ba17ee0f9573dd307708175d90cd2")); executeTest("test dbsnp", spec2); WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java index 15733e104..cf0673ee6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -60,7 +60,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " --NO_HEADER" + " -o %s", 1, - Arrays.asList("2cae3d16f9ed00b07d87e9c49272d877") + Arrays.asList("debbbf3e661b6857cc8d99ff7635bb1d") ); executeTest("testSimpleVCFStreaming", spec); @@ -98,7 +98,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " -EV CompOverlap -noEV -noST" + " -o %s", 1, - Arrays.asList("f60729c900bc8368717653b3fad80d1e") + Arrays.asList("f60729c900bc8368717653b3fad80d1e") //"f60729c900bc8368717653b3fad80d1e" ); executeTest("testVCFStreamingChain", selectTestSpec); diff --git a/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java index 08be4f742..f4c3163cf 100644 --- a/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java @@ -12,19 +12,16 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.BeforeMethod; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.qc.ValidateBAQWalker; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.Utils; import java.io.File; import java.io.FileNotFoundException; -import java.util.Arrays; +import java.io.PrintStream; import java.util.List; import java.util.ArrayList; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.*; /** @@ -188,8 +185,8 @@ public class BAQUnitTest extends BaseTest { System.out.println(Utils.dupString('-', 40)); System.out.println("reads : " + new String(test.readBases)); - ValidateBAQWalker.printQuals(System.out, "in-quals:", test.quals, false); - ValidateBAQWalker.printQuals(System.out, "bq-quals:", result.bq, false); + printQuals(System.out, "in-quals:", test.quals, false); + printQuals(System.out, "bq-quals:", result.bq, false); for (int i = 0; i < test.quals.length; i++) { //result.bq[i] = baqHMM.capBaseByBAQ(result.rawQuals[i], result.bq[i], result.state[i], i); Assert.assertTrue(result.bq[i] >= baqHMM.getMinBaseQual() || test.expected[i] < baqHMM.getMinBaseQual(), "BQ < min base quality"); @@ -197,4 +194,16 @@ public class BAQUnitTest extends BaseTest { } } + + public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) { + out.print(prefix); + for ( int i = 0; i < quals.length; i++) { + if ( asInt ) { + out.printf("%2d", (int)quals[i]); + if ( i+1 != quals.length ) out.print(","); + } else + out.print((char)(quals[i]+33)); + } + out.println(); + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java index e592c99b9..56bf66f53 100644 --- a/public/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java @@ -21,7 +21,7 @@ public class BedParserUnitTest extends BaseTest { private static IndexedFastaSequenceFile seq; private GenomeLocParser genomeLocParser; - private File bedFile = new File("testdata/sampleBedFile.bed"); + private File bedFile = new File("public/testdata/sampleBedFile.bed"); @BeforeClass public void beforeTests() { diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java index 2e618d5eb..e912f97e9 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java @@ -39,7 +39,7 @@ import java.io.IOException; */ public class HapMapUnitTest { // our sample hapmap file - private final static File hapMapFile = new File("testdata/genotypes_chr1_ASW_phase3.3_first500.hapmap"); + private final static File hapMapFile = new File("public/testdata/genotypes_chr1_ASW_phase3.3_first500.hapmap"); private final static String knownLine = "rs2185539 C/T chr1 556738 + ncbi_b36 bbs urn:lsid:bbs.hapmap.org:Protocol:Phase3.r3:1 urn:lsid:bbs.hapmap.org:Assay:Phase3.r3_r" + "s2185539:1 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:4 QC+ CC TC TT CT CC CC CC CC CC CC CC CC CC"; /** diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java index 3929ff388..2f6b589f4 100755 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java @@ -17,8 +17,8 @@ import java.util.*; */ public class IndexFactoryUnitTest { - File inputFile = new File("testdata/HiSeq.10000.vcf"); - File outputFile = new File("testdata/onTheFlyOutputTest.vcf"); + File inputFile = new File("public/testdata/HiSeq.10000.vcf"); + File outputFile = new File("public/testdata/onTheFlyOutputTest.vcf"); File outputFileIndex = Tribble.indexFile(outputFile); /** diff --git a/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java index f85a2f599..f0b1de6fe 100644 --- a/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java @@ -49,12 +49,12 @@ public class ListFileUtilsUnitTest extends BaseTest { public void testIgnoreBlankLinesInBAMListFiles() throws Exception { File tempListFile = createTempListFile("testIgnoreBlankLines", "", - "testdata/exampleBAM.bam", + "public/testdata/exampleBAM.bam", " " ); List expectedBAMFileListAfterUnpacking = new ArrayList(); - expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags())); + expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); performBAMListFileUnpackingTest(tempListFile, expectedBAMFileListAfterUnpacking); } @@ -63,13 +63,13 @@ public class ListFileUtilsUnitTest extends BaseTest { public void testCommentSupportInBAMListFiles() throws Exception { File tempListFile = createTempListFile("testCommentSupport", "#", - "testdata/exampleBAM.bam", - "#testdata/foo.bam", - " # testdata/bar.bam" + "public/testdata/exampleBAM.bam", + "#public/testdata/foo.bam", + " # public/testdata/bar.bam" ); List expectedBAMFileListAfterUnpacking = new ArrayList(); - expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags())); + expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); performBAMListFileUnpackingTest(tempListFile, expectedBAMFileListAfterUnpacking); } diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java index df4e1660d..78ab916db 100644 --- a/public/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java @@ -29,7 +29,7 @@ public class GenomeLocProcessingTrackerUnitTest extends BaseTest { IndexedFastaSequenceFile fasta = null; GenomeLocParser genomeLocParser = null; String chr1 = null; - private final static String FILE_ROOT = "testdata/GLPTFile"; + private final static String FILE_ROOT = "public/testdata/GLPTFile"; @BeforeTest public void before() { diff --git a/public/scala/qscript/core/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala similarity index 95% rename from public/scala/qscript/core/DataProcessingPipeline.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index cad19bd23..f9369ee3f 100755 --- a/public/scala/qscript/core/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -1,15 +1,15 @@ -package core +package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.function.ListWriterFunction -import net.sf.samtools.{SAMFileReader,SAMReadGroupRecord} - import scala.io.Source._ import collection.JavaConversions._ import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel import org.broadinstitute.sting.queue.extensions.picard._ +import net.sf.samtools.{SAMFileReader, SAMReadGroupRecord} +import net.sf.samtools.SAMFileHeader.SortOrder class DataProcessingPipeline extends QScript { @@ -69,6 +69,8 @@ class DataProcessingPipeline extends QScript { @Input(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false) var useBWApe: Boolean = false + @Input(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false) + var bwaThreads: Int = 1 /**************************************************************************** @@ -180,6 +182,7 @@ class DataProcessingPipeline extends QScript { var realignedBams: List[File] = List() var index = 1 for (bam <- bams) { + val readSortedBam = swapExt(bam, ".bam", "." + index + ".sorted.bam" ) val saiFile1 = swapExt(bam, ".bam", "." + index + ".1.sai") val saiFile2 = swapExt(bam, ".bam", "." + index + ".2.sai") val realignedSamFile = swapExt(bam, ".bam", "." + index + ".realigned.sam") @@ -190,11 +193,12 @@ class DataProcessingPipeline extends QScript { bwa_sam_se(bam, saiFile1, realignedSamFile)) } else { - add(bwa_aln_pe(bam, saiFile1, 1), - bwa_aln_pe(bam, saiFile2, 2), - bwa_sam_pe(bam, saiFile1, saiFile2, realignedSamFile)) + add(sortSam(bam, readSortedBam, SortOrder.queryname), + bwa_aln_pe(readSortedBam, saiFile1, 1), + bwa_aln_pe(readSortedBam, saiFile2, 2), + bwa_sam_pe(readSortedBam, saiFile1, saiFile2, realignedSamFile)) } - add(sortSam(realignedSamFile, realignedBamFile)) + add(sortSam(realignedSamFile, realignedBamFile, SortOrder.coordinate)) addReadGroups(realignedBamFile, rgRealignedBamFile, new SAMFileReader(bam)) realignedBams :+= rgRealignedBamFile index = index + 1 @@ -385,9 +389,10 @@ class DataProcessingPipeline extends QScript { this.jobName = queueLogDir + outBam + ".joinBams" } - case class sortSam (inSam: File, outBam: File) extends SortSam { + case class sortSam (inSam: File, outBam: File, sortOrderP: SortOrder) extends SortSam { this.input = List(inSam) this.output = outBam + this.sortOrder = sortOrderP this.memoryLimit = 4 this.isIntermediate = true this.analysisName = queueLogDir + outBam + ".sortSam" @@ -430,7 +435,7 @@ class DataProcessingPipeline extends QScript { case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with BWACommonArgs { @Input(doc="bam file to be aligned") var bam = inBam @Output(doc="output sai file") var sai = outSai - def commandLine = bwaPath + " aln -q 5 " + reference + " -b " + bam + " > " + sai + def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b " + bam + " > " + sai this.analysisName = queueLogDir + outSai + ".bwa_aln_se" this.jobName = queueLogDir + outSai + ".bwa_aln_se" } @@ -438,7 +443,7 @@ class DataProcessingPipeline extends QScript { case class bwa_aln_pe (inBam: File, outSai1: File, index: Int) extends CommandLineFunction with BWACommonArgs { @Input(doc="bam file to be aligned") var bam = inBam @Output(doc="output sai file for 1st mating pair") var sai = outSai1 - def commandLine = bwaPath + " aln -q 5 " + reference + " -b" + index + " " + bam + " > " + sai + def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b" + index + " " + bam + " > " + sai this.analysisName = queueLogDir + outSai1 + ".bwa_aln_pe1" this.jobName = queueLogDir + outSai1 + ".bwa_aln_pe1" } diff --git a/public/scala/qscript/core/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala similarity index 99% rename from public/scala/qscript/core/GATKResourcesBundle.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 24ae045d3..150d78019 100755 --- a/public/scala/qscript/core/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -1,4 +1,4 @@ -package core +package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ diff --git a/public/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala similarity index 99% rename from public/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala index 632c03499..d71c6ae5d 100755 --- a/public/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala @@ -1,3 +1,5 @@ +package org.broadinstitute.sting.queue.qscripts + import org.broadinstitute.sting.commandline.Hidden import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala new file mode 100755 index 000000000..678b356ee --- /dev/null +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala @@ -0,0 +1,91 @@ +package org.broadinstitute.sting.queue.qscripts + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ +import net.sf.samtools.SAMFileReader + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 4/20/11 + * Time: 16:29 PM + */ + + +class RecalibrateBaseQualities extends QScript { + + @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true) + var GATKjar: File = _ + + @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true) + var input: File = _ + + @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=false) + var R: String = new File("/humgen/gsa-scr1/carneiro/stable/R") + + @Input(doc="Reference fasta file", shortName="R", required=false) + var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") + + @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false) + var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") + + val queueLogDir: String = ".qlog/" + var nContigs: Int = 0 + + def getNumberOfContigs(bamFile: File): Int = { + val samReader = new SAMFileReader(new File(bamFile)) + return samReader.getFileHeader.getSequenceDictionary.getSequences.size() + } + + def script = { + + nContigs = getNumberOfContigs(input) + + val recalFile1: File = new File("recal1.csv") + val recalFile2: File = new File("recal2.csv") + val recalBam: File = swapExt(input, ".bam", "recal.bam") + val path1: String = "before" + val path2: String = "after" + + add(cov(input, recalFile1), + recal(input, recalFile1, recalBam), + cov(recalBam, recalFile2), + analyzeCovariates(recalFile1, path1), + analyzeCovariates(recalFile2, path2)) + } + + trait CommandLineGATKArgs extends CommandLineGATK { + this.jarFile = GATKjar + this.reference_sequence = reference + this.memoryLimit = 4 + this.isIntermediate = true + } + + case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { + this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") + this.input_file :+= inBam + this.recal_file = outRecalFile + this.analysisName = queueLogDir + outRecalFile + ".covariates" + this.jobName = queueLogDir + outRecalFile + ".covariates" + this.scatterCount = nContigs + } + + case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs { + this.input_file :+= inBam + this.recal_file = inRecalFile + this.out = outBam + this.isIntermediate = false + this.analysisName = queueLogDir + outBam + ".recalibration" + this.jobName = queueLogDir + outBam + ".recalibration" + this.scatterCount = nContigs + } + + case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates { + this.resources = R + this.recal_file = inRecalFile + this.output_dir = outPath.toString + this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates" + this.jobName = queueLogDir + inRecalFile + ".analyze_covariates" + } +} \ No newline at end of file diff --git a/public/scala/qscript/core/StandardVariantEvaluation.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/StandardVariantEvaluation.scala similarity index 99% rename from public/scala/qscript/core/StandardVariantEvaluation.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/StandardVariantEvaluation.scala index d6af7019e..d333e1dc0 100755 --- a/public/scala/qscript/core/StandardVariantEvaluation.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/StandardVariantEvaluation.scala @@ -1,4 +1,4 @@ -package core +package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk.RodBind diff --git a/public/scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala b/public/scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala deleted file mode 100755 index 7fe249beb..000000000 --- a/public/scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala +++ /dev/null @@ -1,211 +0,0 @@ -package org.broadinstitute.sting.scala -/** -import gatk.walkers.genotyper.{UnifiedGenotyper, GenotypeCall} -import java.io.File -import net.sf.samtools.SAMRecord -import org.broadinstitute.sting.gatk.walkers.LocusWalker -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker -import org.broadinstitute.sting.gatk.contexts.ReferenceContext -import org.broadinstitute.sting.gatk.contexts.AlignmentContext -import org.broadinstitute.sting.utils.Pair -import org.broadinstitute.sting.utils.genotype.GenotypeMetaData -import utils._ -import cmdLine.Argument - -class TransitionTable() { - var table: Array[Array[Double]] = new Array[Array[Double]](4,4) - - def makeKey(b: Char): Int = { - return BaseUtils.simpleBaseToBaseIndex(b) - } - - def fromKey(i: Int): Char = { - return BaseUtils.baseIndexToSimpleBase(i) - } - - def add(ref: Char, subst: Char) = { - table(makeKey(ref))(makeKey(subst)) += 1 - } - - def get(i: Int, j: Int): Double = { - return table(i)(j) - } - - def set(i: Int, j: Int, d: Double) = { - //printf("Setting %d / %d => %f%n", i, j, d) - table(i)(j) = d - } - - def mapEntries[A](f: ((Int, Int, Double) => A)): List[A] = { - return for { i <- List.range(0, 4); j <- List.range(0,4) } yield f(i, j, get(i,j)) - } - - def header(): String = { - //return Utils.join("\t", mapEntries((i,j,x) => "P(%c|true=%c)" format (fromKey(j), fromKey(i))).toArray) - return Utils.join("\t", mapEntries((i,j,x) => "[%c|%c]" format (fromKey(j), fromKey(i))).toArray) - } - - override def toString(): String = { - return Utils.join("\t", mapEntries((i,j,x) => "%.2f" format x).toArray) - } - - def sum(): Double = { - return sumList(table.toList map (x => sumList(x.toList))) - } - - def sumList(xs: List[Double]): Double = { - return (0.0 :: xs) reduceLeft {(x, y) => x + y} - } - - def getRateMatrix(): TransitionTable = { - var sums = (table.toList map (x => sumList(x.toList))).toArray - var t = new TransitionTable() - - //println(sums.toList) - this mapEntries ((i,j,x) => t.set(i, j, x / sums(i))) - - return t - } -} - -**/class BaseTransitionTableCalculator /**{ // extends LocusWalker[Unit,Int] { - private var MIN_MAPPING_QUALITY = 30 - private var MIN_BASE_QUALITY = 20 - private var MIN_LOD = 5 - private var PRINT_FREQ = 1000000 - private var CARE_ABOUT_STRAND = true - - private val SSG = new UnifiedGenotyper() - private val table1 = new TransitionTable() - private val tableFWD = new TransitionTable() - private val tableREV = new TransitionTable() - private val table2 = new TransitionTable() - - @Argument() { val shortName = "v", val doc = "Print out verbose output", val required = false } - var VERBOSE: Boolean = false - - override def initialize() { - SSG.initialize(); - } - - override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext): Unit = { - val callPair = SSG.map(tracker, ref, context) - //val call = callPair.getFirst().get(0) - val pileup = new ReadBackedPileup(ref.getBase(), context) - - def hasNoNs(): Boolean = { - return ! (pileup.getBases() exists ('N' ==)) - } - - if ( isDefinitelyHomRef(callPair) && hasNoNs() ) { - var (refBases, nonRefBases) = splitBases(ref.getBase, pileup.getBases) - - //printf("nonRefBases %s%n", nonRefBases) - if ( nonRefBases.length > 0 ) { - var (nonRefReads, offsets) = getNonRefReads(ref.getBase, pileup) - var nonRefRead: SAMRecord = nonRefReads.head - - nonRefBases.length match { - case 1 => - //if ( goodRead(nonRefRead,offsets.head) ) { - //printf("Including site:%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", nonRefBases, refBases, nonRefRead.format, offsets.head) - //} - addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table1) - addRead(nonRefRead, offsets.head, nonRefBases.head, ref, if ( nonRefRead.getReadNegativeStrandFlag ) tableREV else tableFWD ) - case 2 => - addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table2) - addRead(nonRefReads.tail.head, offsets.tail.head, nonRefBases.tail.head, ref, table2) - case x => - } - - if ( VERBOSE && pileup.size > 30 && nonRefReads.length < 4 ) - printNonRefBases(ref, nonRefReads, offsets, context.getReads.size()) - } - } - } - - implicit def listToJavaList[T](l: List[T]) = java.util.Arrays.asList(l.toArray: _*) - - def printNonRefBases(ref: ReferenceContext, nonRefReads: List[SAMRecord], offsets: List[Integer], depth: Integer): Unit = { - out.println(new ReadBackedPileup(ref.getLocus(), ref.getBase(), listToJavaList(nonRefReads), offsets) + " " + depth) - } - - def addRead(nonRefRead: SAMRecord, offset: Integer, nonRefBase: Char, ref: ReferenceContext, table: TransitionTable): Unit = { - if ( goodRead(nonRefRead, offset) ) { - //printf("Including site:%n call=%s%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", nonRefBases, refBases, nonRefRead.format, offsets.head) - //println(call.getReadDepth, call.getReferencebase, nonRefBases, refBases, nonRefRead.getMappingQuality) - if ( CARE_ABOUT_STRAND && nonRefRead.getReadNegativeStrandFlag() ) { - // it's on the reverse strand - val refComp: Char = BaseUtils.simpleComplement(ref.getBase) - val nonRefComp: Char = BaseUtils.simpleComplement(nonRefBase) - - //printf("Negative: %c/%c is actually %c/%c%n", ref.getBase, nonRefBases.head, refComp, nonRefComp) - table.add(refComp, nonRefComp) - } else { - table.add(ref.getBase, nonRefBase) - } - } - } - - def getNonRefReads(ref: Char, pileup: ReadBackedPileup): (List[SAMRecord], List[Integer]) = { - def getBase(i: Int): Char = { - var offset = pileup.getOffsets().get(i) - return pileup.getReads().get(i).getReadString().charAt(offset.asInstanceOf[Int]) - } - - var subset = for { i <- List.range(0, pileup.size) if getBase(i) != ref } yield i - var reads = subset map (i => pileup.getReads().get(i)) - var offsets = subset map (i => pileup.getOffsets().get(i)) - - //println(reads, offsets, subset) - return (reads, offsets) - } - - - def hasFewNonRefBases(refBase: Char, pileup: String): List[Char] = { - splitBases(refBase, pileup) match { - case (refs, nonRefs) => - //printf("nrf=%s, rb=%s%n", nonRefs.mkString, refs.mkString) - return nonRefs - } - } - - def goodRead( read: SAMRecord, offset: Integer ): Boolean = { - return read.getMappingQuality() > MIN_MAPPING_QUALITY && read.getBaseQualities()(offset.intValue) > MIN_BASE_QUALITY - } - - def splitBases(refBase: Char, pileup: String): (List[Char], List[Char]) = { - val refBases = pileup.toList filter (refBase ==) - val nonRefBases = pileup.toList filter (refBase !=) - return (refBases, nonRefBases) - } - - def isDefinitelyHomRef(call: Pair[java.util.List[GenotypeCall],GenotypeMetaData]): Boolean = { - if ( call == null ) - return false; - - return (! call.getFirst().get(0).isVariant()) && call.getFirst().get(0).getNegLog10PError() > MIN_LOD - } - - def reduceInit(): Int = { - return 0; - } - - def reduce(m: Unit, r: Int): Int = { - return r; - } - - override def onTraversalDone(r: Int) = { - out.println("-------------------- FINAL RESULT --------------------") - out.println("type\t" + table1.header) - def print1(t: String, x: TransitionTable) = { - out.println(t + "\t" + x) - } - - print1("1-mismatch", table1) - print1("2-mismatch", table2) - print1("1-mismatch-fwd", tableFWD) - print1("1-mismatch-rev", tableREV) - } -} -*/ \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala b/public/scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala deleted file mode 100755 index fa30fceff..000000000 --- a/public/scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala +++ /dev/null @@ -1,120 +0,0 @@ -package org.broadinstitute.sting.scala - -import java.io.PrintStream -import org.broadinstitute.sting.gatk.contexts.{ReferenceContext, AlignmentContext} -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker -import collection.JavaConversions._ -import collection.immutable.List -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature -import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature -import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature -import org.broadinstitute.sting.gatk.walkers.{TreeReducible, RefWalker} -import org.broadinstitute.sting.commandline.{Output, Argument} -import org.broadinstitute.sting.utils.{BaseUtils, GenomeLoc} -import collection.mutable.{ListBuffer, HashSet} -import java.lang.Math - -class IntervalAnnotationWalker extends RefWalker[AnnotationMetaData,List[IntervalInfoBuilder]] { - @Argument(doc="Min proportion of bases overlapping between an interval of interest and an annotation interval for annotation to occur",shortName="mpb") - var minPropOverlap : Double = 0.50 // default to 50% - @Output var out : PrintStream = _ - - override def reduceInit : List[IntervalInfoBuilder] = { - Nil - } - - override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext) : AnnotationMetaData = { - val ivalsOfInterest : List[GATKFeature] = tracker.getGATKFeatureMetaData("interval_list",false).toList - val refSeqData : List[Object] = tracker.getReferenceMetaData("refseq").toList - val tableMetaData : List[Object] = tracker.getReferenceMetaData("table",false).toList - var amd : AnnotationMetaData = new AnnotationMetaData(ref.getLocus,ref.getBase) - if ( refSeqData != null ) { - amd.refSeqFeatures = refSeqData.map( (o: Object) => o.asInstanceOf[RefSeqFeature]) - } - if ( tableMetaData != null ) { - amd.tableFeatures = tableMetaData.map( (o: Object) => o.asInstanceOf[TableFeature]).map( (u: TableFeature) => - new Tuple2(u.getLocation,u.getAllValues.toList.reduceLeft(_ + "," + _))) - } - amd.newIvals = ivalsOfInterest.map(_.getLocation) - return amd - } - - override def reduce(mapMetaData : AnnotationMetaData, rList : List[IntervalInfoBuilder]) : List[IntervalInfoBuilder] = { - rList.filter( (u : IntervalInfoBuilder) => u.location.isBefore(mapMetaData.locus)).foreach(u => out.print("%s%n".format(u.done))) - val newList : List[IntervalInfoBuilder] = rList.filter( ! _.finalized ) ::: - mapMetaData.newIvals.filter( (g: GenomeLoc) => g.getStart == mapMetaData.locus.getStart ).map( u => new IntervalInfoBuilder(u,minPropOverlap) ) - newList.foreach(_.update(mapMetaData)) - return newList - } - - override def onTraversalDone(finalList : List[IntervalInfoBuilder]) = { - finalList.foreach(u => out.print("%s%n".format(u.done))) - } - -} - -class AnnotationMetaData(loc: GenomeLoc, ref: Byte) { - val locus : GenomeLoc = loc - var newIvals : List[GenomeLoc] = Nil - var refSeqFeatures : List[RefSeqFeature] = Nil - var tableFeatures : List[(GenomeLoc,String)] = Nil - val refBase : Byte = ref -} - -class IntervalInfoBuilder(loc : GenomeLoc, minProp : Double) { - - val OVERLAP_PROP : Double = minProp - - var metaData : HashSet[String] = new HashSet[String] - var seenTranscripts : HashSet[String] = new HashSet[String] - var baseContent : ListBuffer[Byte] = new ListBuffer[Byte] - var location : GenomeLoc = loc - var gcContent : Double = _ - var entropy : Double = _ - var finalized : Boolean = false - - - def update(mmd : AnnotationMetaData) = { - baseContent += mmd.refBase - mmd.refSeqFeatures.filter( p => ! seenTranscripts.contains(p.getTranscriptUniqueGeneName)).view.foreach(u => addRefSeq(u)) - mmd.tableFeatures.filter( (t : (GenomeLoc,String)) => - ! metaData.contains(t._2) && (t._1.intersect(location).size.asInstanceOf[Double]/location.size() >= OVERLAP_PROP) ).foreach( t => metaData.add(t._2)) - } - - def addRefSeq( rs: RefSeqFeature ) : Unit = { - seenTranscripts.add(rs.getTranscriptUniqueGeneName) - val exons : List[(GenomeLoc,Int)] = rs.getExons.toList.zipWithIndex.filter( p => ((p._1.intersect(location).size+0.0)/location.size) >= minProp ) - if ( exons.size > 0 ) { - metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,exons.map( p => "exon%d".format(p._2) ).reduceLeft(_ + "," + _))) - } else { - if ( location.isBefore(rs.getExons.get(0)) || location.isPast(rs.getExons.get(rs.getNumExons-1)) ) { - metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,"UTR")) - } else { - metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,"Intron")) - } - } - - } - - def done : String = { - finalized = true - def isGC(b : Byte) : Int = if ( BaseUtils.gIndex == b || BaseUtils.cIndex == b ) { 1 } else { 0 } - gcContent = baseContent.foldLeft[Int](0)( (a,b) => a + isGC(b)).asInstanceOf[Double]/location.size() - entropy = calcEntropy(baseContent.map(b => ListBuffer(b))) + calcEntropy(baseContent.reverse.map(b => ListBuffer(b))) - val meta : String = metaData.reduceLeft(_ + "\t" + _) - return "%s\t%d\t%d\t%.2f\t%.2f\t%s".format(location.getContig,location.getStart,location.getStop,gcContent,entropy,meta) - } - - def calcEntropy(byteList : ListBuffer[ListBuffer[Byte]]) : Double = { - if(byteList.size == 1) return 0 - Math.log(1+byteList.tail.size-byteList.tail.dropWhile( u => u.equals(byteList(1))).size) + - calcEntropy(byteList.tail.foldLeft(ListBuffer(byteList(0)))( (a,b) => { - if ( b.equals(byteList(1)) ) { - a.dropRight(1) :+ (a.last ++ b) - } else { - a :+ b - } - })) - } - -} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala b/public/scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala deleted file mode 100755 index 658aabc18..000000000 --- a/public/scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala +++ /dev/null @@ -1,24 +0,0 @@ -package org.broadinstitute.sting.scala - -import org.broadinstitute.sting.gatk.walkers.LocusWalker -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker -import org.broadinstitute.sting.gatk.contexts.ReferenceContext -import org.broadinstitute.sting.gatk.contexts.AlignmentContext - -class ScalaCountLoci extends LocusWalker[Int,Int] { - override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext): Int = { - return 1 - } - - def reduceInit(): Int = { - return 0; - } - - def reduce(m: Int, r: Int): Int = { - return m + r; - } - - def main(args: Array[String]) { - println("Hello, world!") - } -}