Merge branch 'master' of ssh://delangel@nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
c6c0dba040
|
|
@ -64,7 +64,7 @@
|
|||
<property name="usefile" value="false" />
|
||||
|
||||
<!-- If running the 'package' task, this property controls the name of the xml file to package -->
|
||||
<property name="package.xml.dir" value="packages" />
|
||||
<property name="package.xml.dir" value="${public.dir}/packages" />
|
||||
<property name="executable" value="GenomeAnalysisTK" />
|
||||
|
||||
<property environment="env"/>
|
||||
|
|
@ -896,7 +896,7 @@
|
|||
<!-- Build a package consisting of all supporting files -->
|
||||
<target name="package" depends="dist,stage" description="bundle up an executable for distribution">
|
||||
<mkdir dir="dist/packages" />
|
||||
<xslt destdir="dist/packages" style="packages/CreatePackager.xsl" useImplicitFileset="false">
|
||||
<xslt destdir="dist/packages" style="${package.xml.dir}/CreatePackager.xsl" useImplicitFileset="false">
|
||||
<flattenmapper/>
|
||||
<fileset dir="${package.xml.dir}">
|
||||
<include name="${executable}.xml" />
|
||||
|
|
@ -934,7 +934,7 @@
|
|||
|
||||
<!-- Use the packaging system to extract parts of picard-private we need -->
|
||||
<mkdir dir="${dist.dir}/packages" />
|
||||
<xslt in="packages/PicardPrivate.xml" out="${dist.dir}/packages/BuildPicardPrivate.xml" style="packages/CreatePackager.xsl" />
|
||||
<xslt in="${package.xml.dir}/PicardPrivate.xml" out="${dist.dir}/packages/BuildPicardPrivate.xml" style="${package.xml.dir}/CreatePackager.xsl" />
|
||||
<ant antfile="${dist.dir}/packages/BuildPicardPrivate.xml">
|
||||
<property name="additional.jars" value="${required.picard.jars}" />
|
||||
</ant>
|
||||
|
|
|
|||
|
|
@ -0,0 +1,87 @@
|
|||
#!/bin/env Rscript
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
verbose = TRUE
|
||||
|
||||
tranchesFile = args[1]
|
||||
targetTITV = as.numeric(args[2])
|
||||
targetSensitivity = as.numeric(args[3])
|
||||
suppressLegend = ! is.na(args[4])
|
||||
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
# Useful general routines
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
|
||||
MIN_FP_RATE = 0.001 # 1 / 1000 is min error rate
|
||||
|
||||
titvFPEst <- function(titvExpected, titvObserved) {
|
||||
max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), MIN_FP_RATE)
|
||||
}
|
||||
|
||||
titvFPEstV <- function(titvExpected, titvs) {
|
||||
sapply(titvs, function(x) titvFPEst(titvExpected, x))
|
||||
}
|
||||
|
||||
nTPFP <- function(nVariants, FDR) {
|
||||
return(list(TP = nVariants * (1 - FDR/100), FP = nVariants * (FDR / 100)))
|
||||
}
|
||||
|
||||
leftShift <- function(x, leftValue = 0) {
|
||||
r = rep(leftValue, length(x))
|
||||
for ( i in 1:(length(x)-1) ) {
|
||||
#print(list(i=i))
|
||||
r[i] = x[i+1]
|
||||
}
|
||||
r
|
||||
}
|
||||
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
# Tranches plot
|
||||
# -----------------------------------------------------------------------------------------------
|
||||
data2 = read.table(tranchesFile,sep=",",head=T)
|
||||
data2 = data2[order(data2$novelTiTv, decreasing=F),]
|
||||
#data2 = data2[order(data2$FDRtranche, decreasing=T),]
|
||||
cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange")
|
||||
density=c(20, -1, -1, 20)
|
||||
outfile = paste(tranchesFile, ".pdf", sep="")
|
||||
pdf(outfile, height=5, width=8)
|
||||
par(mar = c(5, 5, 4, 2) + 0.1)
|
||||
novelTiTv = c(data2$novelTITV,data2$novelTiTv)
|
||||
alpha = 1 - titvFPEstV(targetTITV, novelTiTv)
|
||||
#print(alpha)
|
||||
|
||||
numGood = round(alpha * data2$numNovel);
|
||||
|
||||
#numGood = round(data2$numNovel * (1-data2$targetTruthSensitivity/100))
|
||||
numBad = data2$numNovel - numGood;
|
||||
|
||||
numPrevGood = leftShift(numGood, 0)
|
||||
numNewGood = numGood - numPrevGood
|
||||
numPrevBad = leftShift(numBad, 0)
|
||||
numNewBad = numBad - numPrevBad
|
||||
|
||||
d=matrix(c(numPrevGood,numNewGood, numNewBad, numPrevBad),4,byrow=TRUE)
|
||||
#print(d)
|
||||
barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (1000s)", density=density, cex.axis=1.25, cex.lab=1.25) # , xlim=c(250000,350000))
|
||||
#abline(v= d[2,dim(d)[2]], lty=2)
|
||||
#abline(v= d[1,3], lty=2)
|
||||
if ( ! suppressLegend )
|
||||
legend(3, length(data2$targetTruthSensitivity)/3 +1, c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25)
|
||||
|
||||
mtext("Ti/Tv",2,line=2.25,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
|
||||
mtext("truth",2,line=0,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
|
||||
axis(2,line=-1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=data2$targetTruthSensitivity, las=1, cex.axis=1.0)
|
||||
axis(2,line=1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0)
|
||||
|
||||
# plot sensitivity vs. specificity
|
||||
sensitivity = data2$truthSensitivity
|
||||
if ( ! is.null(sensitivity) ) {
|
||||
#specificity = titvFPEstV(targetTITV, novelTiTv)
|
||||
specificity = novelTiTv
|
||||
plot(sensitivity, specificity, type="b", col="cornflowerblue", xlab="Tranche truth sensitivity", ylab="Specificity (Novel Ti/Tv ratio)")
|
||||
abline(h=targetTITV, lty=2)
|
||||
abline(v=targetSensitivity, lty=2)
|
||||
#text(max(sensitivity), targetTITV-0.05, labels="Expected novel Ti/Tv", pos=2)
|
||||
}
|
||||
|
||||
dev.off()
|
||||
|
|
@ -0,0 +1,108 @@
|
|||
#!/bin/env Rscript
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
verbose = TRUE
|
||||
|
||||
input = args[1]
|
||||
covariateName = args[2]
|
||||
|
||||
outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
c <- read.table(input, header=T)
|
||||
c <- c[sort.list(c[,1]),]
|
||||
|
||||
#
|
||||
# Plot residual error as a function of the covariate
|
||||
#
|
||||
|
||||
d.good <- c[c$nBases >= 1000,]
|
||||
d.1000 <- c[c$nBases < 1000,]
|
||||
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
|
||||
rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) )
|
||||
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
|
||||
if( length(d.good$nBases) == length(c$nBases) ) {
|
||||
theTitle = paste("RMSE =", round(rmseAll,digits=3))
|
||||
}
|
||||
# Don't let residual error go off the edge of the plot
|
||||
d.good$residualError = d.good$Qempirical-d.good$Qreported
|
||||
d.good$residualError[which(d.good$residualError > 10)] = 10
|
||||
d.good$residualError[which(d.good$residualError < -10)] = -10
|
||||
d.1000$residualError = d.1000$Qempirical-d.1000$Qreported
|
||||
d.1000$residualError[which(d.1000$residualError > 10)] = 10
|
||||
d.1000$residualError[which(d.1000$residualError < -10)] = -10
|
||||
c$residualError = c$Qempirical-c$Qreported
|
||||
c$residualError[which(c$residualError > 10)] = 10
|
||||
c$residualError[which(c$residualError < -10)] = -10
|
||||
pointType = "p"
|
||||
if( length(c$Covariate) <= 20 ) {
|
||||
pointType = "o"
|
||||
}
|
||||
if( is.numeric(c$Covariate) ) {
|
||||
plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20)
|
||||
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
|
||||
plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10))
|
||||
points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue")
|
||||
}
|
||||
dev.off()
|
||||
|
||||
|
||||
#
|
||||
# Plot mean quality versus the covariate
|
||||
#
|
||||
|
||||
outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
pointType = "p"
|
||||
if( length(c$Covariate) <= 20 ) {
|
||||
pointType = "o"
|
||||
}
|
||||
theTitle = paste("Quality By", covariateName);
|
||||
if( is.numeric(c$Covariate) ) {
|
||||
plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20)
|
||||
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
|
||||
plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40))
|
||||
points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue")
|
||||
}
|
||||
dev.off()
|
||||
|
||||
#
|
||||
# Plot histogram of the covariate
|
||||
#
|
||||
|
||||
e = d.good
|
||||
f = d.1000
|
||||
outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0)
|
||||
hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0)
|
||||
|
||||
lwdSize=2
|
||||
if( length(c$Covariate) <= 20 ) {
|
||||
lwdSize=7
|
||||
} else if( length(c$Covariate) <= 70 ) {
|
||||
lwdSize=4
|
||||
}
|
||||
|
||||
if( is.numeric(c$Covariate) ) {
|
||||
if( length(hst$e.Covariate) == 0 ) {
|
||||
plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
} else {
|
||||
plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue")
|
||||
}
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
|
||||
hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
|
||||
plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n")
|
||||
if( length(hst$c.Covariate) > 9 ) {
|
||||
axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
|
||||
} else {
|
||||
axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate)
|
||||
}
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
}
|
||||
dev.off()
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
#!/bin/env Rscript
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
|
||||
input = args[1]
|
||||
Qcutoff = as.numeric(args[2])
|
||||
maxQ = as.numeric(args[3])
|
||||
maxHist = as.numeric(args[4])
|
||||
|
||||
t=read.table(input, header=T)
|
||||
|
||||
#
|
||||
# Plot of reported quality versus empirical quality
|
||||
#
|
||||
|
||||
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,]
|
||||
d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,]
|
||||
d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,]
|
||||
f <- t[t$Qreported < Qcutoff,]
|
||||
e <- rbind(d.good, d.1000, d.10000)
|
||||
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
|
||||
rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) )
|
||||
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
|
||||
if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) {
|
||||
theTitle = paste("RMSE =", round(rmseAll,digits=3));
|
||||
}
|
||||
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
|
||||
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
|
||||
points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16)
|
||||
points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
|
||||
abline(0,1, lty=2)
|
||||
dev.off()
|
||||
|
||||
#
|
||||
# Plot Q empirical histogram
|
||||
#
|
||||
|
||||
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
|
||||
hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0)
|
||||
percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases))
|
||||
entropy = -sum(log2(percentBases)*percentBases)
|
||||
yMax = max(hst$e.nBases)
|
||||
if(maxHist != 0) {
|
||||
yMax = maxHist
|
||||
}
|
||||
plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n")
|
||||
points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1")
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
dev.off()
|
||||
|
||||
#
|
||||
# Plot Q reported histogram
|
||||
#
|
||||
|
||||
outfile = paste(input, ".quality_rep_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
|
||||
hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0)
|
||||
yMax = max(hst$e.nBases)
|
||||
if(maxHist != 0) {
|
||||
yMax = maxHist
|
||||
}
|
||||
plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n")
|
||||
points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1")
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
dev.off()
|
||||
|
|
@ -0,0 +1,138 @@
|
|||
titvFPEst <- function(titvExpected, titvObserved) { max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), 0.001) }
|
||||
|
||||
titvFPEstV <- function(titvExpected, titvs) {
|
||||
sapply(titvs, function(x) titvFPEst(titvExpected, x))
|
||||
}
|
||||
|
||||
calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) {
|
||||
TP <- nknown + (1-titvFPEst(knownTiTv, novelTiTv)) * nnovel
|
||||
2 * TP / 3 / callable
|
||||
}
|
||||
|
||||
marginalTiTv <- function( nx, titvx, ny, titvy ) {
|
||||
tvx = nx / (titvx + 1)
|
||||
tix = nx - tvx
|
||||
tvy = ny / (titvy + 1)
|
||||
tiy = ny - tvy
|
||||
tiz = tix - tiy
|
||||
tvz = tvx - tvy
|
||||
return(tiz / tvz)
|
||||
}
|
||||
marginaldbSNPRate <- function( nx, dbx, ny, dby ) {
|
||||
knownx = nx * dbx / 100
|
||||
novelx = nx - knownx
|
||||
knowny = ny * dby / 100
|
||||
novely = ny - knowny
|
||||
knownz = knownx - knowny
|
||||
novelz = novelx - novely
|
||||
return(knownz / ( knownz + novelz ) * 100)
|
||||
}
|
||||
|
||||
numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) {
|
||||
nCalls <- L * theta * calledFractionOfRegion * sum(1 / seq(1, 2 * nIndividuals))
|
||||
return(list(nCalls = nCalls, nKnown = dbSNPRate * nCalls, nNovel = (1-dbSNPRate) * nCalls))
|
||||
}
|
||||
|
||||
normalize <- function(x) {
|
||||
x / sum(x)
|
||||
}
|
||||
|
||||
normcumsum <- function(x) {
|
||||
cumsum(normalize(x))
|
||||
}
|
||||
|
||||
cumhist <- function(d, ...) {
|
||||
plot(d[order(d)], type="b", col="orange", lwd=2, ...)
|
||||
}
|
||||
|
||||
revcumsum <- function(x) {
|
||||
return(rev(cumsum(rev(x))))
|
||||
}
|
||||
|
||||
phred <- function(x) {
|
||||
log10(max(x,10^(-9.9)))*-10
|
||||
}
|
||||
|
||||
pOfB <- function(b, B, Q) {
|
||||
#print(paste(b, B, Q))
|
||||
p = 1 - 10^(-Q/10)
|
||||
if ( b == B )
|
||||
return(p)
|
||||
else
|
||||
return(1 - p)
|
||||
}
|
||||
|
||||
pOfG <- function(bs, qs, G) {
|
||||
a1 = G[1]
|
||||
a2 = G[2]
|
||||
|
||||
log10p = 0
|
||||
for ( i in 1:length(bs) ) {
|
||||
b = bs[i]
|
||||
q = qs[i]
|
||||
p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2
|
||||
log10p = log10p + log10(p1)
|
||||
}
|
||||
|
||||
return(log10p)
|
||||
}
|
||||
|
||||
pOfGs <- function(nAs, nBs, Q) {
|
||||
bs = c(rep("a", nAs), rep("t", nBs))
|
||||
qs = rep(Q, nAs + nBs)
|
||||
G1 = c("a", "a")
|
||||
G2 = c("a", "t")
|
||||
G3 = c("t", "t")
|
||||
|
||||
log10p1 = pOfG(bs, qs, G1)
|
||||
log10p2 = pOfG(bs, qs, G2)
|
||||
log10p3 = pOfG(bs, qs, G3)
|
||||
Qsample = phred(1 - 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3))))
|
||||
|
||||
return(list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample))
|
||||
}
|
||||
|
||||
QsampleExpected <- function(depth, Q) {
|
||||
weightedAvg = 0
|
||||
for ( d in 1:(depth*3) ) {
|
||||
Qsample = 0
|
||||
pOfD = dpois(d, depth)
|
||||
for ( nBs in 0:d ) {
|
||||
pOfnB = dbinom(nBs, d, 0.5)
|
||||
nAs = d - nBs
|
||||
Qsample = pOfGs(nAs, nBs, Q)$Qsample
|
||||
#Qsample = 1
|
||||
weightedAvg = weightedAvg + Qsample * pOfD * pOfnB
|
||||
print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg)))
|
||||
}
|
||||
}
|
||||
|
||||
return(weightedAvg)
|
||||
}
|
||||
|
||||
plotQsamples <- function(depths, Qs, Qmax) {
|
||||
cols = rainbow(length(Qs))
|
||||
plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling")
|
||||
|
||||
for ( i in 1:length(Qs) ) {
|
||||
Q = Qs[i]
|
||||
y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q)))
|
||||
points(depths, y, col=cols[i], type="b")
|
||||
}
|
||||
|
||||
legend("topleft", paste("Q", Qs), fill=cols)
|
||||
}
|
||||
|
||||
pCallHetGivenDepth <- function(depth, nallelesToCall) {
|
||||
depths = 0:(2*depth)
|
||||
pNoAllelesToCall = apply(as.matrix(depths),1,function(d) sum(dbinom(0:nallelesToCall,d,0.5)))
|
||||
dpois(depths,depth)*(1-pNoAllelesToCall)
|
||||
}
|
||||
|
||||
pCallHets <- function(depth, nallelesToCall) {
|
||||
sum(pCallHetGivenDepth(depth,nallelesToCall))
|
||||
}
|
||||
|
||||
pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) {
|
||||
1-(1-pCallHets(depth,nallelesToCall))^nsamples
|
||||
}
|
||||
|
|
@ -1,154 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.analyzeannotations;
|
||||
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Collection;
|
||||
|
||||
/**
|
||||
* Takes variant calls as .vcf files and creates plots of truth metrics as a function of the various annotations found in the INFO field.
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Jan 15, 2010
|
||||
* @help.summary Takes variant calls as .vcf files and creates plots of truth metrics as a function of the various annotations found in the INFO field.
|
||||
*/
|
||||
|
||||
public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName = "output_prefix", shortName = "output", doc = "The output path and name to prepend to all plots and intermediate data files", required = false)
|
||||
private String OUTPUT_PREFIX = "analyzeAnnotations/";
|
||||
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||
private String PATH_TO_RSCRIPT = "Rscript";
|
||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
||||
private String PATH_TO_RESOURCES = "R/";
|
||||
@Argument(fullName = "min_variants_per_bin", shortName = "minBinSize", doc = "The minimum number of variants in a bin in order to calculate truth metrics.", required = false)
|
||||
private int MIN_VARIANTS_PER_BIN = 1000;
|
||||
@Argument(fullName = "max_variants_per_bin", shortName = "maxBinSize", doc = "The maximum number of variants in a bin.", required = false)
|
||||
private int MAX_VARIANTS_PER_BIN = 20000;
|
||||
@Argument(fullName = "sampleName", shortName = "sampleName", doc = "If supplied, only process variants found in this sample.", required = false)
|
||||
private String SAMPLE_NAME = null;
|
||||
@Argument(fullName = "name", shortName = "name", doc = "Labels for the annotations to make plots look nicer. Each name is a separate -name argument. For example, -name DP,Depth -name AB,AlleleBalance", required = false)
|
||||
private String[] ANNOTATION_NAMES = null;
|
||||
@Argument(fullName = "indicate_mean_num_vars", shortName = "meanNumVars", doc = "If supplied, plots will indicate the distribution of number of variants instead of distribution of value of annotation", required = false)
|
||||
private boolean INDICATE_MEAN_NUM_VARS = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private AnnotationDataManager dataManager;
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
|
||||
// Create a HashMap associating the names of the annotations to full Strings that can be used as labels on plots
|
||||
HashMap<String,String> nameMap = null;
|
||||
if( ANNOTATION_NAMES != null ) {
|
||||
nameMap = new HashMap<String,String>();
|
||||
for( String nameLine : ANNOTATION_NAMES ) {
|
||||
String[] vals = nameLine.split(",");
|
||||
nameMap.put(vals[0],vals[1]);
|
||||
}
|
||||
}
|
||||
dataManager = new AnnotationDataManager( nameMap, INDICATE_MEAN_NUM_VARS );
|
||||
|
||||
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
if( tracker != null ) {
|
||||
Collection<VariantContext> VCs = tracker.getAllVariantContexts(ref);
|
||||
|
||||
// First find out if this variant is in the truth sets
|
||||
boolean isInTruthSet = false;
|
||||
boolean isTrueVariant = false;
|
||||
for ( VariantContext vc : VCs ) {
|
||||
if( vc != null && vc.isSNP() && !vc.isFiltered() ) {
|
||||
if( vc.getSource().toUpperCase().startsWith("TRUTH") ) {
|
||||
isInTruthSet = true;
|
||||
if( vc.isBiallelic() && vc.isVariant() ) {
|
||||
isTrueVariant = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add each annotation in this VCF Record to the dataManager
|
||||
for ( VariantContext vc : VCs ) {
|
||||
if( vc != null && vc.isSNP() && vc.isBiallelic() && !vc.isFiltered() ) {
|
||||
if( !vc.getSource().toUpperCase().startsWith("TRUTH") ) {
|
||||
if( vc.isVariant() ) {
|
||||
dataManager.addAnnotations( vc, SAMPLE_NAME, isInTruthSet, isTrueVariant );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 1; // This value isn't actually used for anything
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0; // Nothing to do here
|
||||
}
|
||||
|
||||
public Integer reduce( Integer value, Integer sum ) {
|
||||
return 0; // Nothing to do here
|
||||
}
|
||||
|
||||
public void onTraversalDone( Integer sum ) {
|
||||
|
||||
// For each annotation, decide how to cut up the data, output intermediate cumulative p(true) tables, and call RScript to plot the tables
|
||||
dataManager.plotCumulativeTables(PATH_TO_RSCRIPT, PATH_TO_RESOURCES, OUTPUT_PREFIX, MIN_VARIANTS_PER_BIN, MAX_VARIANTS_PER_BIN);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,169 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.analyzeannotations;
|
||||
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Jan 18, 2010
|
||||
*/
|
||||
|
||||
public class AnnotationDataManager {
|
||||
|
||||
private final HashMap<String, TreeSet<AnnotationDatum>> data;
|
||||
private final HashMap<String,String> nameMap;
|
||||
private final boolean INDICATE_MEAN_NUM_VARS;
|
||||
|
||||
public AnnotationDataManager( HashMap<String,String> _nameMap, boolean _INDICATE_MEAN_NUM_VARS ) {
|
||||
data = new HashMap<String, TreeSet<AnnotationDatum>>();
|
||||
nameMap = _nameMap;
|
||||
INDICATE_MEAN_NUM_VARS = _INDICATE_MEAN_NUM_VARS;
|
||||
}
|
||||
|
||||
public void addAnnotations( final VariantContext vc, final String sampleName, final boolean isInTruthSet, final boolean isTrueVariant ) {
|
||||
|
||||
if( sampleName != null ) { // Only process variants that are found in the sample with this sampleName
|
||||
if( vc.getGenotype(sampleName).isNoCall() ) { // This variant isn't found in this sample so break out
|
||||
return;
|
||||
}
|
||||
} // else, process all samples
|
||||
|
||||
// Loop over each annotation in the vcf record
|
||||
final Map<String,Object> infoField = new HashMap<String, Object>(vc.getAttributes());
|
||||
infoField.put("QUAL", ((Double)vc.getPhredScaledQual()).toString() ); // add QUAL field to annotations
|
||||
for( Map.Entry<String, Object> annotation : infoField.entrySet() ) {
|
||||
|
||||
float value;
|
||||
try {
|
||||
value = Float.parseFloat( annotation.getValue().toString() );
|
||||
} catch( NumberFormatException e ) {
|
||||
continue; // Skip over annotations that aren't floats, like "DB"
|
||||
}
|
||||
|
||||
TreeSet<AnnotationDatum> treeSet = data.get( annotation.getKey() );
|
||||
if( treeSet == null ) { // This annotation hasn't been seen before
|
||||
treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation
|
||||
data.put( annotation.getKey(), treeSet );
|
||||
}
|
||||
AnnotationDatum datum = new AnnotationDatum( value );
|
||||
if( treeSet.contains(datum) ) { // contains() uses AnnotationDatum's equals function, so it only checks if the value field is already present
|
||||
datum = treeSet.tailSet(datum).first();
|
||||
} else {
|
||||
treeSet.add(datum);
|
||||
}
|
||||
|
||||
final boolean isNovelVariant = !vc.hasID() || !vc.getID().contains("rs");
|
||||
|
||||
// Decide if the variant is a transition or transversion
|
||||
if( VariantContextUtils.getSNPSubstitutionType(vc).compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0 ) {
|
||||
datum.incrementTi( isNovelVariant, isInTruthSet, isTrueVariant );
|
||||
} else {
|
||||
datum.incrementTv( isNovelVariant, isInTruthSet, isTrueVariant );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void plotCumulativeTables( final String PATH_TO_RSCRIPT, final String PATH_TO_RESOURCES, final String OUTPUT_PREFIX,
|
||||
final int MIN_VARIANTS_PER_BIN, final int MAX_VARIANTS_PER_BIN ) {
|
||||
|
||||
final AnnotationDatum thisAnnotationBin = new AnnotationDatum();
|
||||
System.out.println( "\nFinished reading variants into memory. Executing RScript commands:" );
|
||||
|
||||
// For each annotation we've seen
|
||||
for( final String annotationKey : data.keySet() ) {
|
||||
|
||||
String filename = OUTPUT_PREFIX + annotationKey + ".dat";
|
||||
PrintStream output;
|
||||
try {
|
||||
output = new PrintStream(filename); // Create the intermediate data file for this annotation
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotCreateOutputFile(new File(filename), "Can't create intermediate output annotation data file. Does the output directory exist?", e);
|
||||
}
|
||||
|
||||
// Output a header line
|
||||
output.println("value\ttitv\tdbsnp\ttruePositive\tnumVariants\tcategory");
|
||||
|
||||
// Bin SNPs and calculate truth metrics for each bin
|
||||
thisAnnotationBin.clearBin();
|
||||
for( final AnnotationDatum datum : data.get( annotationKey ) ) {
|
||||
thisAnnotationBin.combine( datum );
|
||||
if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
|
||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + "\t" + thisAnnotationBin.calcTPrate() +
|
||||
"\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
|
||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
|
||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
|
||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.TRUTH_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.TRUTH_SET ) + "\ttruth");
|
||||
thisAnnotationBin.clearBin();
|
||||
}
|
||||
// else, continue accumulating variants because this bin isn't full yet
|
||||
}
|
||||
|
||||
// One final bin that may not have been dumped out
|
||||
if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) != 0 ) {
|
||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + "\t" + thisAnnotationBin.calcTPrate() +
|
||||
"\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
|
||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
|
||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
|
||||
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.TRUTH_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.TRUTH_SET ) + "\ttruth");
|
||||
thisAnnotationBin.clearBin();
|
||||
}
|
||||
|
||||
// Close the PrintStream
|
||||
output.close();
|
||||
|
||||
String annotationName = null;
|
||||
if( nameMap != null ) {
|
||||
annotationName = nameMap.get(annotationKey);
|
||||
}
|
||||
if( annotationName == null ) { // This annotation is not in the name map so use the key instead
|
||||
annotationName = annotationKey;
|
||||
}
|
||||
|
||||
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
||||
final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTruthMetrics.R" + " " +
|
||||
OUTPUT_PREFIX + annotationKey + ".dat" + " " + annotationName + " " + MIN_VARIANTS_PER_BIN + " " + INDICATE_MEAN_NUM_VARS;
|
||||
System.out.println( rScriptCommandLine );
|
||||
|
||||
// Execute the RScript command to plot the table of truth values
|
||||
try {
|
||||
Runtime.getRuntime().exec( rScriptCommandLine );
|
||||
} catch ( IOException e ) {
|
||||
throw new UserException.CannotExecuteRScript( rScriptCommandLine, e );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,170 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.analyzeannotations;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Jan 18, 2010
|
||||
*/
|
||||
|
||||
public class AnnotationDatum implements Comparator<AnnotationDatum> {
|
||||
|
||||
public float value;
|
||||
private final int[] ti;
|
||||
private final int[] tv;
|
||||
|
||||
public static final int FULL_SET = 0;
|
||||
public static final int NOVEL_SET = 1;
|
||||
public static final int DBSNP_SET = 2;
|
||||
public static final int TRUTH_SET = 3;
|
||||
public static final int TRUE_POSITIVE = 4;
|
||||
private static final int NUM_SETS = 5;
|
||||
|
||||
public AnnotationDatum() {
|
||||
|
||||
value = 0.0f;
|
||||
ti = new int[NUM_SETS];
|
||||
tv = new int[NUM_SETS];
|
||||
for( int iii = 0; iii < NUM_SETS; iii++ ) {
|
||||
ti[iii] = 0;
|
||||
tv[iii] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
public AnnotationDatum( final float _value ) {
|
||||
|
||||
value = _value;
|
||||
ti = new int[NUM_SETS];
|
||||
tv = new int[NUM_SETS];
|
||||
for( int iii = 0; iii < NUM_SETS; iii++ ) {
|
||||
ti[iii] = 0;
|
||||
tv[iii] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
final public void incrementTi( final boolean isNovelVariant, final boolean isInTruthSet, final boolean isTrueVariant ) {
|
||||
|
||||
ti[FULL_SET]++;
|
||||
if( isNovelVariant ) {
|
||||
ti[NOVEL_SET]++;
|
||||
} else { // Is known, in DBsnp
|
||||
ti[DBSNP_SET]++;
|
||||
}
|
||||
if( isInTruthSet ) {
|
||||
ti[TRUTH_SET]++;
|
||||
if( isTrueVariant ) {
|
||||
ti[TRUE_POSITIVE]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final public void incrementTv( final boolean isNovelVariant, final boolean isInTruthSet, final boolean isTrueVariant ) {
|
||||
|
||||
tv[FULL_SET]++;
|
||||
if( isNovelVariant ) {
|
||||
tv[NOVEL_SET]++;
|
||||
} else { // Is known, in DBsnp
|
||||
tv[DBSNP_SET]++;
|
||||
}
|
||||
if( isInTruthSet ) {
|
||||
tv[TRUTH_SET]++;
|
||||
if( isTrueVariant ) {
|
||||
tv[TRUE_POSITIVE]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final public void combine( final AnnotationDatum that ) {
|
||||
|
||||
for( int iii = 0; iii < NUM_SETS; iii++ ) {
|
||||
this.ti[iii] += that.ti[iii];
|
||||
this.tv[iii] += that.tv[iii];
|
||||
}
|
||||
this.value = that.value; // Overwrite this bin's value
|
||||
}
|
||||
|
||||
final public float calcTiTv( final int INDEX ) {
|
||||
|
||||
if( ti[INDEX] < 0 || tv[INDEX] < 0 ) {
|
||||
throw new ReviewedStingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
|
||||
}
|
||||
|
||||
if( tv[INDEX] == 0 ) { // Don't divide by zero
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
return ((float) ti[INDEX]) / ((float) tv[INDEX]);
|
||||
}
|
||||
|
||||
final public float calcDBsnpRate() {
|
||||
|
||||
if( ti[FULL_SET] + tv[FULL_SET] == 0 ) { // Don't divide by zero
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
return 100.0f * ((float) ti[DBSNP_SET] + tv[DBSNP_SET]) /
|
||||
((float) ti[FULL_SET] + tv[FULL_SET]);
|
||||
}
|
||||
|
||||
final public float calcTPrate() {
|
||||
|
||||
if( ti[TRUTH_SET] + tv[TRUTH_SET] == 0 ) { // Don't divide by zero
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
return 100.0f * ((float) ti[TRUE_POSITIVE] + tv[TRUE_POSITIVE]) /
|
||||
((float) ti[TRUTH_SET] + tv[TRUTH_SET]);
|
||||
}
|
||||
|
||||
final public int numVariants( final int INDEX ) {
|
||||
return ti[INDEX] + tv[INDEX];
|
||||
}
|
||||
|
||||
final public void clearBin() {
|
||||
value = 0.0f;
|
||||
for( int iii = 0; iii < NUM_SETS; iii++ ) {
|
||||
ti[iii] = 0;
|
||||
tv[iii] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
public int compare( AnnotationDatum a1, AnnotationDatum a2 ) { // Function needed for this to be a Comparator
|
||||
if( a1.value < a2.value ) { return -1; }
|
||||
if( a1.value > a2.value ) { return 1; }
|
||||
return 0;
|
||||
}
|
||||
|
||||
public int equals( AnnotationDatum that ) { // Function needed for this to be sorted correctly in a TreeSet
|
||||
if( this.value < that.value ) { return -1; }
|
||||
if( this.value > that.value ) { return 1; }
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
@ -42,5 +42,5 @@ public class LowMQ implements InfoFieldAnnotation {
|
|||
|
||||
public List<String> getKeyNames() { return Arrays.asList("LowMQ"); }
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 3, VCFHeaderLineType.Integer, "3-tuple: <fraction of reads with MQ=0>,<fraction of reads with MQ<=10>,<total nubmer of reads>")); }
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 3, VCFHeaderLineType.Float, "3-tuple: <fraction of reads with MQ=0>,<fraction of reads with MQ<=10>,<total number of reads>")); }
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1011,7 +1011,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
sb.append((char)b);
|
||||
cigar.add(new CigarElement(indelStr.length, CigarOperator.I));
|
||||
} else {
|
||||
throw new ReviewedStingException("Creating an alternate consensus from a complex indel is not allowed");
|
||||
throw new IllegalStateException("Creating an alternate consensus from a complex indel is not allows");
|
||||
}
|
||||
|
||||
if ( reference.length - refIdx > 0 )
|
||||
|
|
|
|||
|
|
@ -0,0 +1,143 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.indels;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
@By(DataSource.READS)
|
||||
// walker to count realigned reads
|
||||
public class RealignedReadCounter extends ReadWalker<Integer, Integer> {
|
||||
|
||||
public static final String ORIGINAL_CIGAR_TAG = "OC";
|
||||
public static final String ORIGINAL_POSITION_TAG = "OP";
|
||||
|
||||
@Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true)
|
||||
protected String intervalsFile = null;
|
||||
|
||||
// the intervals input by the user
|
||||
private Iterator<GenomeLoc> intervals = null;
|
||||
|
||||
// the current interval in the list
|
||||
private GenomeLoc currentInterval = null;
|
||||
|
||||
private long updatedIntervals = 0, updatedReads = 0, affectedBases = 0;
|
||||
private boolean intervalWasUpdated = false;
|
||||
|
||||
public void initialize() {
|
||||
// prepare to read intervals one-by-one, as needed (assuming they are sorted).
|
||||
intervals = new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new File(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY );
|
||||
currentInterval = intervals.hasNext() ? intervals.next() : null;
|
||||
}
|
||||
|
||||
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
if ( currentInterval == null ) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
GenomeLoc readLoc = ref.getGenomeLocParser().createGenomeLoc(read);
|
||||
// hack to get around unmapped reads having screwy locations
|
||||
if ( readLoc.getStop() == 0 )
|
||||
readLoc = ref.getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart());
|
||||
|
||||
if ( readLoc.isBefore(currentInterval) || ReadUtils.is454Read(read) )
|
||||
return 0;
|
||||
|
||||
if ( readLoc.overlapsP(currentInterval) ) {
|
||||
if ( doNotTryToClean(read) )
|
||||
return 0;
|
||||
|
||||
if ( read.getAttribute(ORIGINAL_CIGAR_TAG) != null ) {
|
||||
String newCigar = (String)read.getAttribute(ORIGINAL_CIGAR_TAG);
|
||||
// deal with an old bug
|
||||
if ( read.getCigar().toString().equals(newCigar) ) {
|
||||
//System.out.println(currentInterval + ": " + read.getReadName() + " " + read.getCigarString() + " " + newCigar);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if ( !intervalWasUpdated ) {
|
||||
intervalWasUpdated = true;
|
||||
updatedIntervals++;
|
||||
affectedBases += 20 + getIndelSize(read);
|
||||
}
|
||||
updatedReads++;
|
||||
|
||||
}
|
||||
} else {
|
||||
do {
|
||||
intervalWasUpdated = false;
|
||||
currentInterval = intervals.hasNext() ? intervals.next() : null;
|
||||
} while ( currentInterval != null && currentInterval.isBefore(readLoc) );
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private int getIndelSize(SAMRecord read) {
|
||||
for ( CigarElement ce : read.getCigar().getCigarElements() ) {
|
||||
if ( ce.getOperator() == CigarOperator.I )
|
||||
return 0;
|
||||
if ( ce.getOperator() == CigarOperator.D )
|
||||
return ce.getLength();
|
||||
}
|
||||
logger.warn("We didn't see an indel for this read: " + read.getReadName() + " " + read.getAlignmentStart() + " " + read.getCigar());
|
||||
return 0;
|
||||
}
|
||||
|
||||
private boolean doNotTryToClean(SAMRecord read) {
|
||||
return read.getReadUnmappedFlag() ||
|
||||
read.getNotPrimaryAlignmentFlag() ||
|
||||
read.getReadFailsVendorQualityCheckFlag() ||
|
||||
read.getMappingQuality() == 0 ||
|
||||
read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ||
|
||||
(BadMateFilter.hasBadMate(read));
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return sum + value;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
System.out.println(updatedIntervals + " intervals were updated");
|
||||
System.out.println(updatedReads + " reads were updated");
|
||||
System.out.println(affectedBases + " bases were affected");
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RefWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.List;
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Counts the number of contiguous regions the walker traverses over. Slower than it needs to be, but
|
||||
* very useful since overlapping intervals get merged, so you can count the number of intervals the GATK merges down to.
|
||||
* This was its very first use.
|
||||
*/
|
||||
public class CountIntervals extends RefWalker<Long, Long> {
|
||||
@Output
|
||||
PrintStream out;
|
||||
|
||||
@Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false)
|
||||
int numOverlaps = 2;
|
||||
|
||||
public Long reduceInit() {
|
||||
return 0l;
|
||||
}
|
||||
|
||||
public boolean isReduceByInterval() { return true; }
|
||||
|
||||
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null ) {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<GATKFeature> checkIntervals = tracker.getGATKFeatureMetaData("check",false);
|
||||
return (long) checkIntervals.size();
|
||||
}
|
||||
|
||||
public Long reduce(Long loc, Long prev) {
|
||||
if ( loc == null ) {
|
||||
return 0l;
|
||||
} else {
|
||||
return Math.max(prev,loc);
|
||||
}
|
||||
}
|
||||
|
||||
public void onTraversalDone(List<Pair<GenomeLoc,Long>> finalReduce) {
|
||||
long count = 0;
|
||||
for ( Pair<GenomeLoc,Long> g : finalReduce ) {
|
||||
if ( g.second >= numOverlaps) {
|
||||
count ++;
|
||||
}
|
||||
}
|
||||
out.printf("Number of contiguous intervals: %d",count);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,157 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Emits specific fields as dictated by the user from one or more VCF files.
|
||||
*/
|
||||
@Requires(value={})
|
||||
public class ProfileRodSystem extends RodWalker<Integer, Integer> {
|
||||
@Output(doc="File to which results should be written",required=true)
|
||||
protected PrintStream out;
|
||||
|
||||
@Argument(fullName="nIterations", shortName="N", doc="Number of raw reading iterations to perform", required=false)
|
||||
int nIterations = 1;
|
||||
|
||||
@Argument(fullName="maxRecords", shortName="M", doc="Max. number of records to process", required=false)
|
||||
int MAX_RECORDS = -1;
|
||||
|
||||
SimpleTimer timer = new SimpleTimer("myTimer");
|
||||
|
||||
public void initialize() {
|
||||
File rodFile = getRodFile();
|
||||
|
||||
out.printf("# walltime is in seconds%n");
|
||||
out.printf("# file is %s%n", rodFile);
|
||||
out.printf("# file size is %d bytes%n", rodFile.length());
|
||||
out.printf("operation\titeration\twalltime%n");
|
||||
for ( int i = 0; i < nIterations; i++ ) {
|
||||
out.printf("read.bytes\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_BYTE));
|
||||
out.printf("read.line\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_LINE));
|
||||
out.printf("line.and.parts\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_PARTS));
|
||||
out.printf("decode.loc\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE_LOC));
|
||||
out.printf("full.decode\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE));
|
||||
}
|
||||
|
||||
timer.start(); // start up timer for map itself
|
||||
}
|
||||
|
||||
private enum ReadMode { BY_BYTE, BY_LINE, BY_PARTS, DECODE_LOC, DECODE };
|
||||
|
||||
private final double readFile(File f, ReadMode mode) {
|
||||
timer.start();
|
||||
|
||||
try {
|
||||
byte[] data = new byte[100000];
|
||||
FileInputStream s = new FileInputStream(f);
|
||||
|
||||
if ( mode == ReadMode.BY_BYTE ) {
|
||||
while (true) {
|
||||
if ( s.read(data) == -1 )
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
int counter = 0;
|
||||
VCFCodec codec = new VCFCodec();
|
||||
String[] parts = new String[100000];
|
||||
AsciiLineReader lineReader = new AsciiLineReader(s);
|
||||
|
||||
if ( mode == ReadMode.DECODE_LOC || mode == ReadMode.DECODE )
|
||||
codec.readHeader(lineReader);
|
||||
|
||||
while (counter++ < MAX_RECORDS || MAX_RECORDS == -1) {
|
||||
String line = lineReader.readLine();
|
||||
if ( line == null )
|
||||
break;
|
||||
else if ( mode == ReadMode.BY_PARTS ) {
|
||||
ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR);
|
||||
}
|
||||
else if ( mode == ReadMode.DECODE_LOC ) {
|
||||
codec.decodeLoc(line);
|
||||
}
|
||||
else if ( mode == ReadMode.DECODE ) {
|
||||
processOneVC((VariantContext)codec.decode(line));
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch ( Exception e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
return timer.getElapsedTime();
|
||||
}
|
||||
|
||||
private File getRodFile() {
|
||||
List<ReferenceOrderedDataSource> rods = this.getToolkit().getRodDataSources();
|
||||
ReferenceOrderedDataSource rod = rods.get(0);
|
||||
return rod.getFile();
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null ) // RodWalkers can make funky map calls
|
||||
return 0;
|
||||
|
||||
VariantContext vc = tracker.getVariantContext(ref, "rod", context.getLocation());
|
||||
processOneVC(vc);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private static final void processOneVC(VariantContext vc) {
|
||||
vc.getNSamples(); // force us to parse the samples
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer counter, Integer sum) {
|
||||
return counter + sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer sum) {
|
||||
out.printf("gatk.traversal\t%d\t%.2f%n", 0, timer.getElapsedTime());
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,153 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.*;
|
||||
import java.math.BigInteger;
|
||||
import java.security.MessageDigest;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* a walker for validating (in the style of validating pile-up) the ROD system.
|
||||
*/
|
||||
@Reference(window=@Window(start=-40,stop=40))
|
||||
public class RodSystemValidationWalker extends RodWalker<Integer,Integer> {
|
||||
|
||||
// the divider to use in some of the text output
|
||||
private static final String DIVIDER = ",";
|
||||
|
||||
@Output
|
||||
public PrintStream out;
|
||||
|
||||
@Argument(fullName="PerLocusEqual",required=false,doc="Should we check that all records at the same site produce equivilent variant contexts")
|
||||
public boolean allRecordsVariantContextEquivalent = false;
|
||||
|
||||
// used to calculate the MD5 of a file
|
||||
MessageDigest digest = null;
|
||||
|
||||
// we sometimes need to know what rods the engine's seen
|
||||
List<ReferenceOrderedDataSource> rodList;
|
||||
|
||||
/**
|
||||
* emit the md5 sums for each of the input ROD files (will save up a lot of time if and when the ROD files change
|
||||
* underneath us).
|
||||
*/
|
||||
public void initialize() {
|
||||
// setup the MD5-er
|
||||
try {
|
||||
digest = MessageDigest.getInstance("MD5");
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
throw new ReviewedStingException("Unable to find MD5 checksumer");
|
||||
}
|
||||
out.println("Header:");
|
||||
// enumerate the list of ROD's we've loaded
|
||||
rodList = this.getToolkit().getRodDataSources();
|
||||
for (ReferenceOrderedDataSource rod : rodList) {
|
||||
out.println(rod.getName() + DIVIDER + rod.getType());
|
||||
out.println(rod.getName() + DIVIDER + rod.getFile());
|
||||
out.println(rod.getName() + DIVIDER + md5sum(rod.getFile()));
|
||||
}
|
||||
out.println("Data:");
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param tracker the ref meta data tracker to get RODs
|
||||
* @param ref reference context
|
||||
* @param context the reads
|
||||
* @return an 1 for each site with a rod(s), 0 otherwise
|
||||
*/
|
||||
@Override
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
int ret = 0;
|
||||
if (tracker != null && tracker.getAllRods().size() > 0) {
|
||||
out.print(context.getLocation() + DIVIDER);
|
||||
Collection<GATKFeature> features = tracker.getAllRods();
|
||||
for (GATKFeature feat : features)
|
||||
out.print(feat.getName() + DIVIDER);
|
||||
out.println(";");
|
||||
ret++;
|
||||
}
|
||||
|
||||
// if the argument was set, check for equivalence
|
||||
if (allRecordsVariantContextEquivalent && tracker != null) {
|
||||
Collection<VariantContext> col = tracker.getAllVariantContexts(ref);
|
||||
VariantContext con = null;
|
||||
for (VariantContext contextInList : col)
|
||||
if (con == null) con = contextInList;
|
||||
else if (!con.equals(col)) out.println("FAIL: context " + col + " doesn't match " + con);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provide an initial value for reduce computations.
|
||||
*
|
||||
* @return Initial value of reduce.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduces a single map with the accumulator provided as the ReduceType.
|
||||
*
|
||||
* @param value result of the map.
|
||||
* @param sum accumulator for the reduce.
|
||||
* @return accumulator with result of the map taken into account.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(Integer result) {
|
||||
// Double check traversal result to make count is the same.
|
||||
// TODO: Is this check necessary?
|
||||
out.println("[REDUCE RESULT] Traversal result is: " + result);
|
||||
}
|
||||
|
||||
// shamelessly absconded and adapted from http://www.javalobby.org/java/forums/t84420.html
|
||||
private String md5sum(File f) {
|
||||
InputStream is;
|
||||
try {
|
||||
is = new FileInputStream(f);
|
||||
} catch (FileNotFoundException e) {
|
||||
return "Not a file";
|
||||
}
|
||||
byte[] buffer = new byte[8192];
|
||||
int read = 0;
|
||||
try {
|
||||
while ((read = is.read(buffer)) > 0) {
|
||||
digest.update(buffer, 0, read);
|
||||
}
|
||||
byte[] md5sum = digest.digest();
|
||||
BigInteger bigInt = new BigInteger(1, md5sum);
|
||||
return bigInt.toString(16);
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeException("Unable to process file for MD5", e);
|
||||
}
|
||||
finally {
|
||||
try {
|
||||
is.close();
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeException("Unable to close input stream for MD5 calculation", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,298 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Walks over the input data set, calculating the number of reads seen for diagnostic purposes.
|
||||
* Can also count the number of reads matching a given criterion using read filters (see the
|
||||
* --read-filter command line argument). Simplest example of a read-backed analysis.
|
||||
*/
|
||||
@BAQMode(QualityMode = BAQ.QualityMode.DONT_MODIFY, ApplicationTime = BAQ.ApplicationTime.HANDLED_IN_WALKER)
|
||||
@Reference(window=@Window(start=-5,stop=5))
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
|
||||
public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
|
||||
@Output(doc="File to which results should be written",required=true)
|
||||
protected PrintStream out;
|
||||
|
||||
@Argument(doc="maximum read length to apply the BAQ calculation too",required=false)
|
||||
protected int maxReadLen = 1000;
|
||||
|
||||
@Argument(doc="",required=false)
|
||||
protected int bw = 7;
|
||||
|
||||
@Argument(doc="",required=false)
|
||||
protected boolean samtoolsMode = false;
|
||||
|
||||
@Argument(doc="only operates on reads with this name",required=false)
|
||||
protected String readName = null;
|
||||
|
||||
@Argument(doc="If true, all differences are errors", required=false)
|
||||
protected boolean strict = false;
|
||||
|
||||
@Argument(doc="prints info for each read", required=false)
|
||||
protected boolean printEachRead = false;
|
||||
|
||||
@Argument(doc="Also prints out detailed comparison information when for known calculation differences", required=false)
|
||||
protected boolean alsoPrintWarnings = false;
|
||||
|
||||
@Argument(doc="Include reads without BAQ tag", required=false)
|
||||
protected boolean includeReadsWithoutBAQTag = false;
|
||||
|
||||
@Argument(doc="x each read is processed", required=false)
|
||||
protected int magnification = 1;
|
||||
|
||||
@Argument(doc="Profile performance", required=false)
|
||||
protected boolean profile = false;
|
||||
|
||||
int counter = 0;
|
||||
|
||||
BAQ baqHMM = null; // matches current samtools parameters
|
||||
|
||||
public void initialize() {
|
||||
if ( samtoolsMode )
|
||||
baqHMM = new BAQ(1e-3, 0.1, bw, (byte)0, true);
|
||||
else
|
||||
baqHMM = new BAQ();
|
||||
}
|
||||
|
||||
long goodReads = 0, badReads = 0;
|
||||
|
||||
public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
|
||||
|
||||
if ( (readName == null || readName.equals(read.getReadName())) && read.getReadLength() <= maxReadLen && (includeReadsWithoutBAQTag || BAQ.hasBAQTag(read) ) ) {
|
||||
if ( baqHMM.excludeReadFromBAQ(read) )
|
||||
return 0;
|
||||
|
||||
if ( profile ) {
|
||||
profileBAQ(ref, read);
|
||||
} else {
|
||||
validateBAQ(ref, read);
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
SimpleTimer tagTimer = new SimpleTimer("from.tag");
|
||||
SimpleTimer baqReadTimer = new SimpleTimer("baq.read");
|
||||
SimpleTimer glocalTimer = new SimpleTimer("hmm.glocal");
|
||||
|
||||
private void profileBAQ(ReferenceContext ref, SAMRecord read) {
|
||||
IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference();
|
||||
BAQ.BAQCalculationResult baq = null;
|
||||
|
||||
tagTimer.restart();
|
||||
for ( int i = 0; i < magnification; i++ ) { BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag); }
|
||||
tagTimer.stop();
|
||||
|
||||
baqReadTimer.restart();
|
||||
for ( int i = 0; i < magnification; i++ ) { baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY ); }
|
||||
baqReadTimer.stop();
|
||||
|
||||
glocalTimer.restart();
|
||||
for ( int i = 0; i < magnification; i++ )
|
||||
baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY);
|
||||
glocalTimer.stop();
|
||||
}
|
||||
|
||||
|
||||
private void validateBAQ(ReferenceContext ref, SAMRecord read) {
|
||||
IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference();
|
||||
byte[] baqFromTag = BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag);
|
||||
if (counter++ % 1000 == 0 || printEachRead) out.printf("Checking read %s (%d)%n", read.getReadName(), counter);
|
||||
|
||||
BAQ.BAQCalculationResult baq = baqHMM.calcBAQFromHMM(read, refReader);
|
||||
|
||||
boolean fail = false;
|
||||
boolean print = false;
|
||||
int badi = 0;
|
||||
|
||||
if ( BAQ.hasBAQTag(read) ) {
|
||||
for ( badi = 0; badi < baqFromTag.length; badi++ ) {
|
||||
if ( baqFromTag[badi] != baq.bq[badi] ) {
|
||||
if ( cigarLength(read) != read.getReadLength() ) {
|
||||
print = true;
|
||||
fail = false;
|
||||
out.printf(" different, but cigar length != read length%n");
|
||||
break;
|
||||
}
|
||||
if (MathUtils.arrayMin(read.getBaseQualities()) == 0) {
|
||||
print = true;
|
||||
fail = strict;
|
||||
out.printf(" different, but Q0 base detected%n");
|
||||
break;
|
||||
}
|
||||
else if (readHasSoftClip(read) && ! samtoolsMode) {
|
||||
print = true;
|
||||
fail = strict;
|
||||
out.printf(" different, but soft clip detected%n");
|
||||
break;
|
||||
} else if (readHasDeletion(read) ) { // && ! samtoolsMode) {
|
||||
print = true;
|
||||
fail = strict;
|
||||
out.printf(" different, but deletion detected%n");
|
||||
break;
|
||||
} else if ( baq.bq[badi] < baqHMM.getMinBaseQual() ) {
|
||||
print = fail = true;
|
||||
out.printf(" Base quality %d < min %d", baq.bq[badi], baqHMM.getMinBaseQual());
|
||||
break;
|
||||
} else {
|
||||
print = fail = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ( fail || print )
|
||||
badReads++;
|
||||
else
|
||||
goodReads++;
|
||||
}
|
||||
|
||||
if ( fail || printEachRead || ( print && alsoPrintWarnings ) ) {
|
||||
byte[] pos = new byte[baq.bq.length];
|
||||
for ( int i = 0; i < pos.length; i++ ) pos[i] = (byte)i;
|
||||
|
||||
out.printf(" read length : %d%n", read.getReadLength());
|
||||
out.printf(" read start : %d (%d unclipped)%n", read.getAlignmentStart(), read.getUnclippedStart());
|
||||
out.printf(" cigar : %s%n", read.getCigarString());
|
||||
out.printf(" ref bases : %s%n", new String(baq.refBases));
|
||||
out.printf(" read bases : %s%n", new String(read.getReadBases()));
|
||||
out.printf(" ref length : %d%n", baq.refBases.length);
|
||||
out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG));
|
||||
if ( BAQ.hasBAQTag(read) ) printQuals(" BQ deltas : ", getBAQDeltas(read), true);
|
||||
printQuals(" original quals: ", read.getBaseQualities(), true);
|
||||
printQuals(" baq quals: ", baq.bq, true);
|
||||
printQuals(" positions : ", pos, true);
|
||||
printQuals(" original quals: ", read.getBaseQualities());
|
||||
if ( BAQ.hasBAQTag(read) ) printQuals(" tag quals: ", baqFromTag);
|
||||
printQuals(" hmm quals: ", baq.bq);
|
||||
out.printf(" read bases : %s%n", new String(read.getReadBases()));
|
||||
out.println(Utils.dupString('-', 80));
|
||||
}
|
||||
|
||||
|
||||
if ( fail )
|
||||
throw new StingException(String.format("BAQ from read and from HMM differ in read %s at position %d: tag qual = %d, hmm qual = %d",
|
||||
read.getReadName(), badi, baqFromTag[badi], baq.bq[badi]));
|
||||
}
|
||||
|
||||
private final static boolean readHasSoftClip(SAMRecord read) {
|
||||
for (CigarElement e : read.getCigar().getCigarElements()) {
|
||||
if ( e.getOperator() == CigarOperator.SOFT_CLIP )
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private final static boolean readHasDeletion(SAMRecord read) {
|
||||
for (CigarElement e : read.getCigar().getCigarElements()) {
|
||||
if ( e.getOperator() == CigarOperator.DELETION )
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public final void printQuals( String prefix, byte[] quals ) {
|
||||
printQuals(prefix, quals, false);
|
||||
}
|
||||
|
||||
public final void printQuals( String prefix, byte[] quals, boolean asInt ) {
|
||||
printQuals(out, prefix, quals, asInt);
|
||||
}
|
||||
|
||||
public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) {
|
||||
out.print(prefix);
|
||||
for ( int i = 0; i < quals.length; i++) {
|
||||
if ( asInt ) {
|
||||
out.printf("%2d", (int)quals[i]);
|
||||
if ( i+1 != quals.length ) out.print(",");
|
||||
} else
|
||||
out.print((char)(quals[i]+33));
|
||||
}
|
||||
out.println();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present.
|
||||
* @param read
|
||||
* @return
|
||||
*/
|
||||
public static byte[] getBAQDeltas(SAMRecord read) {
|
||||
byte[] baq = BAQ.getBAQTag(read);
|
||||
if ( baq != null ) {
|
||||
byte[] deltas = new byte[baq.length];
|
||||
for ( int i = 0; i < deltas.length; i++)
|
||||
deltas[i] = (byte)(-1 * (baq[i] - 64));
|
||||
return deltas;
|
||||
} else
|
||||
return null;
|
||||
}
|
||||
|
||||
private int cigarLength(SAMRecord read) {
|
||||
int readI = 0;
|
||||
for ( CigarElement elt : read.getCigar().getCigarElements() ) {
|
||||
int l = elt.getLength();
|
||||
switch (elt.getOperator()) {
|
||||
case N: // cannot handle these
|
||||
return 0;
|
||||
case H : case P : // ignore pads and hard clips
|
||||
break;
|
||||
case S :
|
||||
case I :
|
||||
readI += l;
|
||||
break;
|
||||
case D : break;
|
||||
case M :
|
||||
readI += l;
|
||||
break;
|
||||
default:
|
||||
throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
|
||||
}
|
||||
}
|
||||
return readI;
|
||||
}
|
||||
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer nreads) {
|
||||
if ( profile ) {
|
||||
out.printf("n.reads baq.per.read calculation time.in.secs%n");
|
||||
printTimer(nreads, tagTimer);
|
||||
printTimer(nreads, glocalTimer);
|
||||
printTimer(nreads, baqReadTimer);
|
||||
} else {
|
||||
out.printf("total reads BAQ'd %d; concordant BAQ reads %d %.4f; discordant BAQ reads %d %.4f%n", nreads,
|
||||
goodReads, (100.0 * goodReads) / nreads,
|
||||
badReads, (100.0 * badReads) / nreads);
|
||||
}
|
||||
}
|
||||
|
||||
private final void printTimer(int nreads, SimpleTimer timer) {
|
||||
out.printf("%d %d %s %.2f%n", nreads, magnification, timer.getName(), timer.getElapsedTime());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -55,7 +55,6 @@ import java.util.*;
|
|||
|
||||
public class ApplyRecalibration extends RodWalker<Integer, Integer> {
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Inputs
|
||||
/////////////////////////////
|
||||
|
|
@ -86,6 +85,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
|
|||
final private List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
final private Set<String> inputNames = new HashSet<String>();
|
||||
final private NestedHashMap lodMap = new NestedHashMap();
|
||||
final private NestedHashMap annotationMap = new NestedHashMap();
|
||||
final private Set<String> ignoreInputFilterSet = new TreeSet<String>();
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -124,6 +124,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
|
|||
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
|
||||
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model"));
|
||||
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out"));
|
||||
final TreeSet<String> samples = new TreeSet<String>();
|
||||
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));
|
||||
|
||||
|
|
@ -149,6 +150,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
|
|||
for ( final String line : new XReadLines( RECAL_FILE ) ) {
|
||||
final String[] vals = line.split(",");
|
||||
lodMap.put( Double.parseDouble(vals[3]), vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys
|
||||
annotationMap.put( vals[4], vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys
|
||||
}
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(RECAL_FILE, e);
|
||||
|
|
@ -174,11 +176,15 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
|
|||
String filterString = null;
|
||||
final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
|
||||
final Double lod = (Double) lodMap.get( vc.getChr(), vc.getStart(), vc.getEnd() );
|
||||
final String worstAnnotation = (String) annotationMap.get( vc.getChr(), vc.getStart(), vc.getEnd() );
|
||||
if( lod == null ) {
|
||||
throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc );
|
||||
}
|
||||
|
||||
// Annotate the new record with its VQSLOD and the worst performing annotation
|
||||
attrs.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod));
|
||||
attrs.put(VariantRecalibrator.CULPRIT_KEY, worstAnnotation);
|
||||
|
||||
for( int i = tranches.size() - 1; i >= 0; i-- ) {
|
||||
final Tranche tranche = tranches.get(i);
|
||||
if( lod >= tranche.minVQSLod ) {
|
||||
|
|
|
|||
|
|
@ -190,6 +190,19 @@ public class GaussianMixtureModel {
|
|||
return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
|
||||
}
|
||||
|
||||
public Double evaluateDatumInOneDimension( final VariantDatum datum, final int iii ) {
|
||||
if(datum.isNull[iii]) { return null; }
|
||||
|
||||
final Normal normal = new Normal(0.0, 1.0, null);
|
||||
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
|
||||
int gaussianIndex = 0;
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
normal.setState( gaussian.mu[iii], gaussian.sigma.get(iii, iii) );
|
||||
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + Math.log10( normal.pdf( datum.annotations[iii] ) );
|
||||
}
|
||||
return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
|
||||
}
|
||||
|
||||
public double evaluateDatumMarginalized( final VariantDatum datum ) {
|
||||
int numVals = 0;
|
||||
double sumPVarInGaussian = 0.0;
|
||||
|
|
|
|||
|
|
@ -254,7 +254,9 @@ public class VariantDataManager {
|
|||
|
||||
public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) {
|
||||
for( final VariantDatum datum : data ) {
|
||||
RECAL_FILE.println(String.format("%s,%d,%d,%.4f", datum.contig, datum.start, datum.stop, datum.lod));
|
||||
RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s",
|
||||
datum.contig, datum.start, datum.stop, datum.lod,
|
||||
(datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ public class VariantDatum implements Comparable<VariantDatum> {
|
|||
public String contig;
|
||||
public int start;
|
||||
public int stop;
|
||||
public int worstAnnotation;
|
||||
public MultivariateGaussian assignment; // used in K-means implementation
|
||||
|
||||
public int compareTo( final VariantDatum other ) {
|
||||
|
|
|
|||
|
|
@ -60,6 +60,7 @@ import java.util.*;
|
|||
public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> implements TreeReducible<ExpandingArrayList<VariantDatum>> {
|
||||
|
||||
public static final String VQS_LOD_KEY = "VQSLOD";
|
||||
public static final String CULPRIT_KEY = "culprit";
|
||||
|
||||
@ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection();
|
||||
|
||||
|
|
@ -232,6 +233,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
engine.evaluateData( dataManager.getData(), goodModel, false );
|
||||
final GaussianMixtureModel badModel = engine.generateModel( dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS ) );
|
||||
engine.evaluateData( dataManager.getData(), badModel, true );
|
||||
engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel );
|
||||
|
||||
final ExpandingArrayList<VariantDatum> randomData = dataManager.getRandomDataForPlotting( 6000 );
|
||||
|
||||
|
|
|
|||
|
|
@ -57,6 +57,23 @@ public class VariantRecalibratorEngine {
|
|||
}
|
||||
}
|
||||
|
||||
public void calculateWorstPerformingAnnotation( final List<VariantDatum> data, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel ) {
|
||||
for( final VariantDatum datum : data ) {
|
||||
int worstAnnotation = -1;
|
||||
double minProb = Double.MAX_VALUE;
|
||||
for( int iii = 0; iii < datum.annotations.length; iii++ ) {
|
||||
final Double goodProbLog10 = goodModel.evaluateDatumInOneDimension(datum, iii);
|
||||
final Double badProbLog10 = badModel.evaluateDatumInOneDimension(datum, iii);
|
||||
if( goodProbLog10 != null && badProbLog10 != null ) {
|
||||
final double prob = goodProbLog10 - badProbLog10;
|
||||
if(prob < minProb) { minProb = prob; worstAnnotation = iii; }
|
||||
}
|
||||
}
|
||||
datum.worstAnnotation = worstAnnotation;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Private Methods used for generating a GaussianMixtureModel
|
||||
/////////////////////////////
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.datasources.pipeline;
|
||||
package org.broadinstitute.sting.pipeline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
|
@ -22,7 +22,7 @@
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.datasources.pipeline;
|
||||
package org.broadinstitute.sting.pipeline;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Map;
|
||||
|
|
@ -22,7 +22,7 @@
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.datasources.pipeline;
|
||||
package org.broadinstitute.sting.pipeline;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Map;
|
||||
|
|
@ -1,149 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.broad;
|
||||
|
||||
import org.apache.commons.io.filefilter.RegexFileFilter;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileFilter;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class PicardAggregationUtils {
|
||||
public static final String PICARD_AGGREGATION_DIR = "/seq/picard_aggregation/";
|
||||
|
||||
/**
|
||||
* Returns the path to the sample BAM.
|
||||
* @param project Project
|
||||
* @param sample Sample
|
||||
* @param version Version
|
||||
* @return The path to the sample BAM.
|
||||
*/
|
||||
public static String getSampleBam(String project, String sample, int version) {
|
||||
return getSampleDir(project, sample, version) + sample + ".bam";
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the path to the latest BAM.
|
||||
* @param project Project
|
||||
* @param sample Sample
|
||||
* @return The path to the latest BAM.
|
||||
* @throws FileNotFoundException If a finished directory cannot be found for a sample.
|
||||
*/
|
||||
public static String getSampleBam(String project, String sample) throws FileNotFoundException {
|
||||
return getSampleDir(project, sample) + sample + ".bam";
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the sample directory.
|
||||
* @param project Project
|
||||
* @param sample Sample
|
||||
* @param version Version
|
||||
* @return the sample directory.
|
||||
*/
|
||||
public static String getSampleDir(String project, String sample, int version) {
|
||||
return PICARD_AGGREGATION_DIR + String.format("%s/%s/v%d/", project, sample, version);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the latest finished directory for this project sample.
|
||||
* @param project Project
|
||||
* @param sample Sample
|
||||
* @return The path to the latest finished directory.
|
||||
* @throws FileNotFoundException If a finished directory cannot be found for a sample.
|
||||
*/
|
||||
public static String getSampleDir(String project, String sample) throws FileNotFoundException {
|
||||
int latestVersion = getLatestVersion(project, sample);
|
||||
if (latestVersion == 0)
|
||||
throw new FileNotFoundException("Unable to find a finished directory for project sample " + project + "/" + sample);
|
||||
return getSampleDir(project, sample, latestVersion);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the latest finished version directory.
|
||||
* Because isilon metadata operations are relatively slow this method
|
||||
* tries not to look for every possible versioned directory.
|
||||
* @param project Project
|
||||
* @param sample Sample
|
||||
* @return The highest finished version directory or 0 if a finished directory was not found.
|
||||
*/
|
||||
public static int getLatestVersion(String project, String sample) {
|
||||
return getLatestVersion(project, sample, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the latest finished version directory after startVersion.
|
||||
* Because isilon metadata operations are relatively slow this method
|
||||
* tries not to look for every possible versioned directory.
|
||||
* @param project Project
|
||||
* @param sample Sample
|
||||
* @param startVersion minimum version to return
|
||||
* @return The highest finished version directory after startVersion
|
||||
*/
|
||||
public static int getLatestVersion(String project, String sample, int startVersion) {
|
||||
int version = Math.max(0, startVersion);
|
||||
boolean nextExists = true;
|
||||
while (nextExists) {
|
||||
nextExists = false;
|
||||
for (int next = 3; next > 0; next--)
|
||||
if (isFinished(project, sample, version + next)) {
|
||||
version += next;
|
||||
nextExists = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Special case when the version is 0
|
||||
// Because isilon storage takes a while to do meta data operations only look through every file if we have to.
|
||||
if (version == 0) {
|
||||
File sampleDir = new File(PICARD_AGGREGATION_DIR + project + "/" + sample);
|
||||
if (sampleDir.exists()) {
|
||||
FileFilter filter = new RegexFileFilter("v\\d+");
|
||||
File[] files = sampleDir.listFiles(filter);
|
||||
int[] versions = new int[files.length];
|
||||
for (int i = 0; i < files.length; i++)
|
||||
versions[i] = Integer.parseInt(files[i].getName().substring(1));
|
||||
Arrays.sort(versions);
|
||||
for (int i = versions.length - 1; i >= 0; i--) {
|
||||
if (isFinished(project, sample, versions[i])) {
|
||||
version = versions[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return version == 0 ? startVersion : version;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the project sample directory contains a finished.txt
|
||||
* @param project Project
|
||||
* @param sample Sample
|
||||
* @param version Version
|
||||
* @return true if the project sample directory contains a finished.txt
|
||||
*/
|
||||
public static boolean isFinished(String project, String sample, int version) {
|
||||
return new File(getSampleDir(project, sample, version), "finished.txt").exists();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,108 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.broad;
|
||||
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
public class PicardAnalysisFiles {
|
||||
private static final String REFERENCE_SEQUENCE_HEADER = "REFERENCE_SEQUENCE";
|
||||
private static final String TARGET_INTERVALS_HEADER = "TARGET_INTERVALS";
|
||||
private static final String BAIT_INTERVALS_HEADER = "BAIT_INTERVALS";
|
||||
private static final String[] ANALYSIS_HEADERS = {REFERENCE_SEQUENCE_HEADER, TARGET_INTERVALS_HEADER, BAIT_INTERVALS_HEADER};
|
||||
private static final String ANALYSIS_FILES = "analysis_files.txt";
|
||||
|
||||
private String path;
|
||||
private Map<String,Set<String>> headerValues = new HashMap<String,Set<String>>();
|
||||
|
||||
public PicardAnalysisFiles(String project, String sample) throws FileNotFoundException {
|
||||
this(PicardAggregationUtils.getSampleDir(project, sample) + ANALYSIS_FILES);
|
||||
}
|
||||
|
||||
public PicardAnalysisFiles(String project, String sample, int version) throws FileNotFoundException {
|
||||
this(PicardAggregationUtils.getSampleDir(project, sample, version) + ANALYSIS_FILES);
|
||||
}
|
||||
|
||||
public PicardAnalysisFiles(String path) throws FileNotFoundException {
|
||||
this.path = path;
|
||||
HashMap<String,Integer> headerIndexes = null;
|
||||
for (String line: new XReadLines(new File(path))) {
|
||||
if (line.startsWith("#"))
|
||||
continue;
|
||||
String[] values = line.split("\t");
|
||||
if (headerIndexes == null) {
|
||||
headerIndexes = new HashMap<String,Integer>();
|
||||
for (String header: ANALYSIS_HEADERS) {
|
||||
headerIndexes.put(header, ArrayUtils.indexOf(values, header));
|
||||
headerValues.put(header, new HashSet<String>());
|
||||
}
|
||||
} else {
|
||||
for (String header: ANALYSIS_HEADERS) {
|
||||
int index = headerIndexes.get(header);
|
||||
if (values.length <= index)
|
||||
throw new StingException(String.format("Unable to parse line in %s: %n%s", path, line));
|
||||
String value = values[index];
|
||||
headerValues.get(header).add(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String getPath() {
|
||||
return path;
|
||||
}
|
||||
|
||||
public String getReferenceSequence() {
|
||||
return getSingle(REFERENCE_SEQUENCE_HEADER);
|
||||
}
|
||||
|
||||
public String getTargetIntervals() {
|
||||
return getSingle(TARGET_INTERVALS_HEADER);
|
||||
}
|
||||
|
||||
public String getBaitIntervals() {
|
||||
return getSingle(BAIT_INTERVALS_HEADER);
|
||||
}
|
||||
|
||||
private String getSingle(String header) {
|
||||
Set<String> values = headerValues.get(header);
|
||||
if (values.size() > 1) {
|
||||
throw new UnsupportedOperationException(path + " contains more than one value for " + header + ": " + values);
|
||||
} else if (values.size() == 0) {
|
||||
return null;
|
||||
} else {
|
||||
String value = values.iterator().next();
|
||||
return "null".equals(value) ? null : value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,123 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.broad;
|
||||
|
||||
import org.apache.commons.io.FilenameUtils;
|
||||
import org.apache.commons.lang.NullArgumentException;
|
||||
import org.broadinstitute.sting.datasources.pipeline.Pipeline;
|
||||
import org.broadinstitute.sting.datasources.pipeline.PipelineProject;
|
||||
import org.broadinstitute.sting.datasources.pipeline.PipelineSample;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.utils.yaml.YamlUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
/**
|
||||
* Automatically gets the latest version using PicardAggregationUtils.
|
||||
*/
|
||||
public class PicardPipeline {
|
||||
|
||||
protected static final String PROJECT_TAG = "SQUIDProject";
|
||||
protected static final String SAMPLE_TAG = "CollaboratorID";
|
||||
protected static final String PICARD_BAM_TYPE = "cleaned";
|
||||
|
||||
private PicardPipeline() {}
|
||||
|
||||
/**
|
||||
* Creates a new PicardPipeline
|
||||
* @param path Path to a tsv with project [tab] sample on each line or a pipeline yaml.
|
||||
* @return a new Picard
|
||||
* @throws FileNotFoundException when unable to find the file or any supporting files.
|
||||
*/
|
||||
public static Pipeline parse(File path) throws FileNotFoundException {
|
||||
if (path == null)
|
||||
throw new NullArgumentException("path");
|
||||
|
||||
Pipeline pipeline;
|
||||
if (path.getName().endsWith(".tsv")) {
|
||||
pipeline = new Pipeline();
|
||||
pipeline.getProject().setName(FilenameUtils.getBaseName(path.getPath()));
|
||||
for (String line: new XReadLines(path)) {
|
||||
String[] projectSample = line.split("\t");
|
||||
addSample(pipeline, projectSample[0], projectSample[1]);
|
||||
}
|
||||
} else if (path.getName().endsWith(".yaml")) {
|
||||
pipeline = YamlUtils.load(Pipeline.class, path);
|
||||
} else {
|
||||
throw new UserException.BadInput("Path does not end with .tsv or .yaml: " + path.getPath());
|
||||
}
|
||||
|
||||
update(pipeline);
|
||||
return pipeline;
|
||||
}
|
||||
|
||||
private static void update(Pipeline pipeline) throws FileNotFoundException {
|
||||
for (PipelineSample sample: pipeline.getSamples())
|
||||
updateSample(pipeline.getProject(), sample);
|
||||
}
|
||||
|
||||
private static void addSample(Pipeline pipeline, String project, String sample) {
|
||||
PipelineSample pipelineSample = new PipelineSample();
|
||||
pipelineSample.getTags().put(PROJECT_TAG, project);
|
||||
pipelineSample.getTags().put(SAMPLE_TAG, sample);
|
||||
pipeline.getSamples().add(pipelineSample);
|
||||
}
|
||||
|
||||
private static void updateSample(PipelineProject pipelineProject, PipelineSample pipelineSample) throws FileNotFoundException {
|
||||
if (!pipelineSample.getTags().containsKey(PROJECT_TAG) && !pipelineSample.getTags().containsKey(SAMPLE_TAG))
|
||||
return;
|
||||
String project = pipelineSample.getTags().get(PROJECT_TAG);
|
||||
String sample = pipelineSample.getTags().get(SAMPLE_TAG);
|
||||
int version = PicardAggregationUtils.getLatestVersion(project, sample);
|
||||
if (version <= 0)
|
||||
throw new UserException.BadInput("Project sample not found: " + project + "/" + sample);
|
||||
String bam = PicardAggregationUtils.getSampleBam(project, sample, version);
|
||||
if (pipelineSample.getId() == null)
|
||||
pipelineSample.setId(project + "_" + sample);
|
||||
pipelineSample.getBamFiles().put(PICARD_BAM_TYPE, new File(bam));
|
||||
|
||||
PicardAnalysisFiles analysis = new PicardAnalysisFiles(project, sample, version);
|
||||
if (pipelineProject.getReferenceFile() == null) {
|
||||
String referenceSequence = analysis.getReferenceSequence();
|
||||
ReferenceData referenceData = ReferenceData.getByReference(referenceSequence);
|
||||
pipelineProject.setReferenceFile(new File(referenceData.getReference()));
|
||||
pipelineProject.setRefseqTable(new File(referenceData.getRefseq()));
|
||||
if (analysis.getTargetIntervals() != null)
|
||||
pipelineProject.setIntervalList(new File(analysis.getTargetIntervals()));
|
||||
pipelineProject.setEvalDbsnp(new File(referenceData.getDbsnp(129)));
|
||||
if (referenceData.getDbsnpVersions().contains(132)) {
|
||||
pipelineProject.setGenotypeDbsnp(new File(referenceData.getDbsnp(132)));
|
||||
} else {
|
||||
pipelineProject.setGenotypeDbsnp(new File(referenceData.getDbsnp(129)));
|
||||
}
|
||||
} else {
|
||||
String referenceSequence = analysis.getReferenceSequence();
|
||||
if (!pipelineProject.getReferenceFile().getPath().equals(referenceSequence))
|
||||
throw new UserException.BadInput("Samples sequenced with different references");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,135 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.broad;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/**
|
||||
* Tracks data related to reference files at the Broad.
|
||||
*/
|
||||
public enum ReferenceData {
|
||||
/**
|
||||
* HG18 reference data
|
||||
*/
|
||||
HG18("hg18"),
|
||||
|
||||
/**
|
||||
* HG19 reference data
|
||||
*/
|
||||
HG19("hg19");
|
||||
|
||||
private static final String REFSEQ_DIR = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/";
|
||||
private static final String DBSNP_DIR = "/humgen/gsa-hpprojects/GATK/data/";
|
||||
|
||||
private final String name;
|
||||
private final String reference;
|
||||
private final String refseq;
|
||||
private final Map<Integer,String> dbsnps;
|
||||
|
||||
ReferenceData(String name) {
|
||||
this.name = name;
|
||||
Map<Integer,String> dbsnps = new TreeMap<Integer,String>();
|
||||
if ("hg18".equals(name)) {
|
||||
this.reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
|
||||
this.refseq = REFSEQ_DIR + "refGene-big-table-hg18.txt";
|
||||
dbsnps.put(129, DBSNP_DIR + "dbsnp_129_hg18.rod");
|
||||
} else if ("hg19".equals(name)) {
|
||||
this.reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta";
|
||||
this.refseq = REFSEQ_DIR + "refGene-big-table-hg19.txt";
|
||||
dbsnps.put(129, DBSNP_DIR + "dbsnp_129_b37.vcf");
|
||||
dbsnps.put(132, DBSNP_DIR + "dbsnp_132_b37.vcf");
|
||||
} else
|
||||
throw new UnsupportedOperationException("Unknown reference: " + name);
|
||||
this.dbsnps = Collections.unmodifiableMap(dbsnps);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the name of the reference.
|
||||
* @return the name of the reference.
|
||||
*/
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the path to the fasta.
|
||||
* @return the path to the fasta.
|
||||
*/
|
||||
public String getReference() {
|
||||
return reference;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the path to the refseq table.
|
||||
* @return the path to the refseq table.
|
||||
*/
|
||||
public String getRefseq() {
|
||||
return refseq;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the dbsnp versions available.
|
||||
* @return the dbsnp versions available.
|
||||
*/
|
||||
public Set<Integer> getDbsnpVersions() {
|
||||
return dbsnps.keySet();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the dbsnp path for the version.
|
||||
* @param version version from getDbsnpVersions()
|
||||
* @return the dbsnp path for the version.
|
||||
*/
|
||||
public String getDbsnp(int version) {
|
||||
return dbsnps.get(version);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the dbsnp type for the version, "VCF" or "DBSNP".
|
||||
* @param version version from getDbsnpVersions()
|
||||
* @return the dbsnp type for the version, "VCF" or "DBSNP".
|
||||
*/
|
||||
public String getDbsnpType(int version) {
|
||||
String dbsnp = getDbsnp(version);
|
||||
if (dbsnp == null)
|
||||
return null;
|
||||
return dbsnp.toLowerCase().endsWith(".vcf") ? "VCF" : "DBSNP";
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the reference data based on the path or null.
|
||||
* @param reference path to the reference
|
||||
* @return the reference data based on the path or null.
|
||||
*/
|
||||
public static ReferenceData getByReference(String reference) {
|
||||
for (ReferenceData data: ReferenceData.values())
|
||||
if (data.reference.equals(reference))
|
||||
return data;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -68,13 +68,13 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
ReferenceSequence seq = null;
|
||||
}
|
||||
|
||||
private static ThreadLocal<Cache> cache;
|
||||
private ThreadLocal<Cache> cache;
|
||||
|
||||
static {
|
||||
{
|
||||
resetThreadLocalCache();
|
||||
};
|
||||
}
|
||||
|
||||
protected static void resetThreadLocalCache() {
|
||||
protected void resetThreadLocalCache() {
|
||||
cache = new ThreadLocal<Cache> () {
|
||||
@Override protected Cache initialValue() {
|
||||
return new Cache();
|
||||
|
|
|
|||
|
|
@ -414,7 +414,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
* @return vc subcontext
|
||||
*/
|
||||
public VariantContext subContextFromGenotypes(Collection<Genotype> genotypes, Set<Allele> alleles) {
|
||||
return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), getFilters(), getAttributes());
|
||||
return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), filtersWereApplied() ? getFilters() : null, getAttributes());
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ public class RealignerTargetCreatorIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
"-T RealignerTargetCreator -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s",
|
||||
1,
|
||||
Arrays.asList("fd2d9dbf718f7a6d82ae1787ac1fe61e"));
|
||||
Arrays.asList("f23ba17ee0f9573dd307708175d90cd2"));
|
||||
executeTest("test dbsnp", spec2);
|
||||
|
||||
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(
|
||||
|
|
|
|||
|
|
@ -26,8 +26,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
|
||||
VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf",
|
||||
"d33212a84368e821cbedecd4f59756d6", // tranches
|
||||
"a35cd067f378442eee8cd5edeea92be0", // recal file
|
||||
"126d52843f4a57199ee97750ffc16a07"); // cut VCF
|
||||
"4652dca41222bebdf9d9fda343b2a835", // recal file
|
||||
"5350b1a4c1250cf3b77ca45327c04711"); // cut VCF
|
||||
|
||||
@DataProvider(name = "VRTest")
|
||||
public Object[][] createData1() {
|
||||
|
|
|
|||
|
|
@ -60,7 +60,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest {
|
|||
" --NO_HEADER" +
|
||||
" -o %s",
|
||||
1,
|
||||
Arrays.asList("2cae3d16f9ed00b07d87e9c49272d877")
|
||||
Arrays.asList("debbbf3e661b6857cc8d99ff7635bb1d")
|
||||
);
|
||||
|
||||
executeTest("testSimpleVCFStreaming", spec);
|
||||
|
|
@ -98,7 +98,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest {
|
|||
" -EV CompOverlap -noEV -noST" +
|
||||
" -o %s",
|
||||
1,
|
||||
Arrays.asList("f60729c900bc8368717653b3fad80d1e")
|
||||
Arrays.asList("f60729c900bc8368717653b3fad80d1e") //"f60729c900bc8368717653b3fad80d1e"
|
||||
);
|
||||
executeTest("testVCFStreamingChain", selectTestSpec);
|
||||
|
||||
|
|
|
|||
|
|
@ -22,8 +22,10 @@
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.datasources.pipeline;
|
||||
package org.broadinstitute.sting.pipeline;
|
||||
|
||||
import org.broadinstitute.sting.pipeline.Pipeline;
|
||||
import org.broadinstitute.sting.pipeline.PipelineSample;
|
||||
import org.testng.Assert;
|
||||
import org.broadinstitute.sting.utils.yaml.YamlUtils;
|
||||
|
||||
|
|
@ -12,19 +12,16 @@ import org.testng.annotations.DataProvider;
|
|||
import org.testng.annotations.BeforeMethod;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.qc.ValidateBAQWalker;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.Arrays;
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.samtools.*;
|
||||
|
||||
/**
|
||||
|
|
@ -188,8 +185,8 @@ public class BAQUnitTest extends BaseTest {
|
|||
|
||||
System.out.println(Utils.dupString('-', 40));
|
||||
System.out.println("reads : " + new String(test.readBases));
|
||||
ValidateBAQWalker.printQuals(System.out, "in-quals:", test.quals, false);
|
||||
ValidateBAQWalker.printQuals(System.out, "bq-quals:", result.bq, false);
|
||||
printQuals(System.out, "in-quals:", test.quals, false);
|
||||
printQuals(System.out, "bq-quals:", result.bq, false);
|
||||
for (int i = 0; i < test.quals.length; i++) {
|
||||
//result.bq[i] = baqHMM.capBaseByBAQ(result.rawQuals[i], result.bq[i], result.state[i], i);
|
||||
Assert.assertTrue(result.bq[i] >= baqHMM.getMinBaseQual() || test.expected[i] < baqHMM.getMinBaseQual(), "BQ < min base quality");
|
||||
|
|
@ -197,4 +194,16 @@ public class BAQUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
}
|
||||
|
||||
public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) {
|
||||
out.print(prefix);
|
||||
for ( int i = 0; i < quals.length; i++) {
|
||||
if ( asInt ) {
|
||||
out.printf("%2d", (int)quals[i]);
|
||||
if ( i+1 != quals.length ) out.print(",");
|
||||
} else
|
||||
out.print((char)(quals[i]+33));
|
||||
}
|
||||
out.println();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,61 +0,0 @@
|
|||
package org.broadinstitute.sting.utils.broad;
|
||||
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
public class PicardAggregationUtilsUnitTest {
|
||||
public static final String PROJECT = "C474";
|
||||
public static final String SAMPLE = "NA19651";
|
||||
public static final String MISSING_PROJECT = "C0";
|
||||
public static final String MISSING_SAMPLE = "0";
|
||||
private int latestVersion = -1;
|
||||
|
||||
@Test
|
||||
public void testGetLatestVersion() {
|
||||
latestVersion = PicardAggregationUtils.getLatestVersion(PROJECT, SAMPLE);
|
||||
System.out.println(String.format("Latest version for %s %s is %d", PROJECT, SAMPLE, latestVersion));
|
||||
Assert.assertTrue(latestVersion > 0);
|
||||
Assert.assertEquals(PicardAggregationUtils.getLatestVersion(PROJECT, SAMPLE, latestVersion), latestVersion);
|
||||
}
|
||||
|
||||
@Test(dependsOnMethods = "testGetLatestVersion")
|
||||
public void testGetSampleBam() throws Exception {
|
||||
String test = PicardAggregationUtils.getSampleBam(PROJECT, SAMPLE);
|
||||
String latest = PicardAggregationUtils.getSampleBam(PROJECT, SAMPLE, latestVersion);
|
||||
Assert.assertEquals(test, latest);
|
||||
}
|
||||
|
||||
@Test(dependsOnMethods = "testGetLatestVersion")
|
||||
public void testGetSampleDir() throws Exception {
|
||||
String test = PicardAggregationUtils.getSampleDir(PROJECT, SAMPLE);
|
||||
String latest = PicardAggregationUtils.getSampleDir(PROJECT, SAMPLE, latestVersion);
|
||||
Assert.assertEquals(test, latest);
|
||||
}
|
||||
|
||||
@Test(dependsOnMethods = "testGetLatestVersion")
|
||||
public void testIsFinished() {
|
||||
Assert.assertTrue(PicardAggregationUtils.isFinished(PROJECT, SAMPLE, latestVersion));
|
||||
Assert.assertFalse(PicardAggregationUtils.isFinished(PROJECT, SAMPLE, latestVersion + 1));
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = FileNotFoundException.class)
|
||||
public void testMissingSampleBam() throws Exception {
|
||||
PicardAggregationUtils.getSampleBam(MISSING_PROJECT, MISSING_SAMPLE);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = FileNotFoundException.class)
|
||||
public void testMissingSampleDir() throws Exception {
|
||||
PicardAggregationUtils.getSampleDir(MISSING_PROJECT, MISSING_SAMPLE);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLatestVersionMissing() {
|
||||
Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE), 0);
|
||||
Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE, -1), -1);
|
||||
Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE, 0), 0);
|
||||
Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE, 1), 1);
|
||||
Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE, 2), 2);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,56 +0,0 @@
|
|||
package org.broadinstitute.sting.utils.broad;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
import static org.broadinstitute.sting.utils.broad.PicardAggregationUtilsUnitTest.*;
|
||||
|
||||
public class PicardAnalysisFilesUnitTest extends BaseTest {
|
||||
@Test
|
||||
public void testParseLatest() throws Exception {
|
||||
PicardAnalysisFiles files = new PicardAnalysisFiles(PROJECT, SAMPLE);
|
||||
Assert.assertNotNull(files.getPath());
|
||||
files = new PicardAnalysisFiles(PROJECT, SAMPLE, PicardAggregationUtils.getLatestVersion(PROJECT, SAMPLE));
|
||||
Assert.assertNotNull(files.getPath());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseValid() throws Exception {
|
||||
PicardAnalysisFiles file = new PicardAnalysisFiles(BaseTest.validationDataLocation + "picard_analysis_file.txt");
|
||||
Assert.assertEquals(file.getReferenceSequence(), "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta");
|
||||
Assert.assertEquals(file.getTargetIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list");
|
||||
Assert.assertEquals(file.getBaitIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseValidWithComments() throws Exception {
|
||||
PicardAnalysisFiles file = new PicardAnalysisFiles(BaseTest.validationDataLocation + "picard_analysis_file_with_comments.txt");
|
||||
Assert.assertEquals(file.getReferenceSequence(), "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta");
|
||||
Assert.assertEquals(file.getTargetIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list");
|
||||
Assert.assertEquals(file.getBaitIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list");
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = FileNotFoundException.class)
|
||||
public void testParseBadPath() throws Exception {
|
||||
new PicardAnalysisFiles(BaseTest.validationDataLocation + "non_existent_picard_analysis_file.txt");
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = FileNotFoundException.class)
|
||||
public void testParseMissingLatest() throws Exception {
|
||||
new PicardAnalysisFiles(MISSING_PROJECT, MISSING_SAMPLE);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = FileNotFoundException.class)
|
||||
public void testParseMissingVersion() throws Exception {
|
||||
new PicardAnalysisFiles(PROJECT, SAMPLE, PicardAggregationUtils.getLatestVersion(PROJECT, SAMPLE) + 2);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = UnsupportedOperationException.class)
|
||||
public void testParseMultipleReferences() throws Exception {
|
||||
PicardAnalysisFiles file = new PicardAnalysisFiles(BaseTest.validationDataLocation + "picard_analysis_file_with_different_references.txt");
|
||||
file.getReferenceSequence();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,71 +0,0 @@
|
|||
package org.broadinstitute.sting.utils.broad;
|
||||
|
||||
import org.testng.Assert;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.FilenameUtils;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.datasources.pipeline.Pipeline;
|
||||
import org.broadinstitute.sting.datasources.pipeline.PipelineSample;
|
||||
import org.broadinstitute.sting.utils.yaml.YamlUtils;
|
||||
import org.testng.annotations.Test;
|
||||
import static org.broadinstitute.sting.utils.broad.PicardAggregationUtilsUnitTest.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
|
||||
public class PicardPipelineUnitTest {
|
||||
@Test
|
||||
public void testParseTsv() throws IOException {
|
||||
File tsv = writeTsv(PROJECT, SAMPLE);
|
||||
Pipeline pipeline = PicardPipeline.parse(tsv);
|
||||
validatePipeline(pipeline, FilenameUtils.getBaseName(tsv.getPath()));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseTsvWithPicardComments() throws Exception {
|
||||
File tsv = writeTsv("C460", "HG01359");
|
||||
PicardPipeline.parse(tsv);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseYaml() throws IOException {
|
||||
File yaml = writeYaml("project_name", PROJECT, SAMPLE);
|
||||
Pipeline pipeline = PicardPipeline.parse(yaml);
|
||||
validatePipeline(pipeline, "project_name");
|
||||
}
|
||||
|
||||
private void validatePipeline(Pipeline pipeline, String name) {
|
||||
Assert.assertEquals(pipeline.getProject().getName(), name);
|
||||
Assert.assertTrue(pipeline.getProject().getReferenceFile().exists(), "reference not found");
|
||||
Assert.assertTrue(pipeline.getProject().getIntervalList().exists(), "intervals not found");
|
||||
Assert.assertTrue(pipeline.getProject().getRefseqTable().exists(), "refseq not found");
|
||||
Assert.assertTrue(pipeline.getProject().getGenotypeDbsnp().exists(), "genotype dbsnp not found");
|
||||
Assert.assertTrue(pipeline.getProject().getEvalDbsnp().exists(), "eval dbsnp not found");
|
||||
Assert.assertEquals(pipeline.getSamples().size(), 1);
|
||||
for (PipelineSample sample: pipeline.getSamples()) {
|
||||
Assert.assertEquals(sample.getId(), PROJECT + "_" + SAMPLE);
|
||||
Assert.assertTrue(sample.getBamFiles().get(PicardPipeline.PICARD_BAM_TYPE).exists(), "bam not found");
|
||||
Assert.assertEquals(sample.getTags().get(PicardPipeline.PROJECT_TAG), PROJECT);
|
||||
Assert.assertEquals(sample.getTags().get(PicardPipeline.SAMPLE_TAG), SAMPLE);
|
||||
}
|
||||
}
|
||||
|
||||
private File writeTsv(String project, String sample) throws IOException {
|
||||
File tsv = BaseTest.createTempFile("pipeline", ".tsv");
|
||||
FileUtils.writeLines(tsv, Collections.singletonList(project + "\t" + sample));
|
||||
return tsv;
|
||||
}
|
||||
|
||||
private File writeYaml(String projectName, String project, String sample) throws IOException {
|
||||
File yaml = BaseTest.createTempFile("pipeline", ".yaml");
|
||||
PipelineSample pipelineSample = new PipelineSample();
|
||||
pipelineSample.getTags().put(PicardPipeline.PROJECT_TAG, project);
|
||||
pipelineSample.getTags().put(PicardPipeline.SAMPLE_TAG, sample);
|
||||
Pipeline pipeline = new Pipeline();
|
||||
pipeline.getProject().setName(projectName);
|
||||
pipeline.getSamples().add(pipelineSample);
|
||||
YamlUtils.dump(pipeline, yaml);
|
||||
return yaml;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,49 +0,0 @@
|
|||
package org.broadinstitute.sting.utils.broad;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
public class ReferenceDataUnitTest {
|
||||
@Test
|
||||
public void testNames() {
|
||||
Assert.assertEquals(ReferenceData.HG18.getName(), "hg18");
|
||||
Assert.assertEquals(ReferenceData.HG19.getName(), "hg19");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFilesExist() {
|
||||
for (ReferenceData data: ReferenceData.values()) {
|
||||
Assert.assertTrue(new File(data.getReference()).exists());
|
||||
Assert.assertTrue(new File(data.getRefseq()).exists());
|
||||
for (int version: data.getDbsnpVersions()) {
|
||||
Assert.assertTrue(new File(data.getDbsnp(version)).exists());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDbsnps() {
|
||||
Assert.assertTrue(new File(ReferenceData.HG18.getDbsnp(129)).exists());
|
||||
Assert.assertTrue(new File(ReferenceData.HG19.getDbsnp(129)).exists());
|
||||
Assert.assertTrue(new File(ReferenceData.HG19.getDbsnp(132)).exists());
|
||||
Assert.assertNull(ReferenceData.HG19.getDbsnp(130));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDbsnpTypes() {
|
||||
Assert.assertEquals(ReferenceData.HG18.getDbsnpType(129), "DBSNP");
|
||||
Assert.assertEquals(ReferenceData.HG19.getDbsnpType(129), "VCF");
|
||||
Assert.assertEquals(ReferenceData.HG19.getDbsnpType(132), "VCF");
|
||||
Assert.assertNull(ReferenceData.HG19.getDbsnpType(130));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetByReference() {
|
||||
Assert.assertEquals(ReferenceData.getByReference(BaseTest.hg18Reference), ReferenceData.HG18);
|
||||
Assert.assertEquals(ReferenceData.getByReference(BaseTest.hg19Reference), ReferenceData.HG19);
|
||||
Assert.assertEquals(ReferenceData.getByReference("none"), null);
|
||||
}
|
||||
}
|
||||
|
|
@ -34,11 +34,6 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
|
|||
private static final List<Integer> QUERY_SIZES = Arrays.asList(1, 10, 100, 1000);
|
||||
private static final List<Integer> CACHE_SIZES = Arrays.asList(-1, 10, 1000);
|
||||
|
||||
@BeforeMethod
|
||||
public void clearCache() {
|
||||
CachingIndexedFastaSequenceFile.resetThreadLocalCache();
|
||||
}
|
||||
|
||||
@DataProvider(name = "fastas")
|
||||
public Object[][] createData1() {
|
||||
List<Object[]> params = new ArrayList<Object[]>();
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@
|
|||
</executable>
|
||||
<resources>
|
||||
<!-- Supplemental scripts for graph generation, etc. -->
|
||||
<file name="R/plot_residualError_OtherCovariate.R" />
|
||||
<file name="R/plot_residualError_QualityScoreCovariate.R" />
|
||||
<file name="public/R/plot_residualError_OtherCovariate.R" />
|
||||
<file name="public/R/plot_residualError_QualityScoreCovariate.R" />
|
||||
</resources>
|
||||
</package>
|
||||
|
|
|
|||
|
|
@ -40,10 +40,10 @@
|
|||
</executable>
|
||||
<resources>
|
||||
<!-- Sample reads and reference files -->
|
||||
<file name="testdata/exampleBAM.bam" />
|
||||
<file name="testdata/exampleBAM.bam.bai" />
|
||||
<file name="testdata/exampleFASTA.fasta" />
|
||||
<file name="testdata/exampleFASTA.fasta.fai" />
|
||||
<file name="testdata/exampleFASTA.dict" />
|
||||
<file name="public/testdata/exampleBAM.bam" />
|
||||
<file name="public/testdata/exampleBAM.bam.bai" />
|
||||
<file name="public/testdata/exampleFASTA.fasta" />
|
||||
<file name="public/testdata/exampleFASTA.fasta.fai" />
|
||||
<file name="public/testdata/exampleFASTA.dict" />
|
||||
</resources>
|
||||
</package>
|
||||
|
|
|
|||
|
|
@ -13,12 +13,14 @@
|
|||
</modules>
|
||||
<resources>
|
||||
<!-- GATK sample code and build scripts -->
|
||||
<file name="java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java" />
|
||||
<file name="java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java" />
|
||||
<file name="java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java" />
|
||||
<file name="java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java" />
|
||||
<file name="java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java" />
|
||||
<file name="java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java" />
|
||||
<file name="public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java" />
|
||||
<file name="public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java" />
|
||||
<file name="public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java" />
|
||||
<file name="public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java" />
|
||||
<file name="public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java" />
|
||||
<file name="public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java" />
|
||||
<file name="public/R/plot_Tranches.R" />
|
||||
<file name="public/R/titvFPEst.R" />
|
||||
</resources>
|
||||
<release>
|
||||
<executable directory="/humgen/gsa-hpprojects/GATK/bin" symlink="current" />
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@
|
|||
<package name="org.broadinstitute.sting.queue.**" />
|
||||
|
||||
<!-- Pipeline + Utils -->
|
||||
<package name="org.broadinstitute.sting.datasources.pipeline.**" />
|
||||
<package name="org.broadinstitute.sting.pipeline.**" />
|
||||
<package name="org.broadinstitute.sting.utils.**" />
|
||||
|
||||
<!-- Scala -->
|
||||
|
|
|
|||
|
|
@ -0,0 +1,83 @@
|
|||
#!/usr/bin/perl -w
|
||||
|
||||
# Runs the liftover tool on a VCF and properly handles the output
|
||||
|
||||
use strict;
|
||||
use Getopt::Long;
|
||||
|
||||
my $in = undef;
|
||||
my $gatk = undef;
|
||||
my $chain = undef;
|
||||
my $newRef = undef;
|
||||
my $oldRef = undef;
|
||||
my $out = undef;
|
||||
my $tmp = "/tmp";
|
||||
my $recordOriginalLocation = 0;
|
||||
GetOptions( "vcf=s" => \$in,
|
||||
"gatk=s" => \$gatk,
|
||||
"chain=s" => \$chain,
|
||||
"newRef=s" => \$newRef,
|
||||
"oldRef=s" => \$oldRef,
|
||||
"out=s" => \$out,
|
||||
"tmp=s" => \$tmp,
|
||||
"recordOriginalLocation" => \$recordOriginalLocation);
|
||||
|
||||
if ( !$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out ) {
|
||||
print "Usage: liftOverVCF.pl\n\t-vcf \t\t<input vcf>\n\t-gatk \t\t<path to gatk trunk>\n\t-chain \t\t<chain file>\n\t-newRef \t<path to new reference prefix; we will need newRef.dict, .fasta, and .fasta.fai>\n\t-oldRef \t<path to old reference prefix; we will need oldRef.fasta>\n\t-out \t\t<output vcf>\n\t-tmp \t\t<temp file location; defaults to /tmp>\n\t-recordOriginalLocation \t\t<Should we record what the original location was in the INFO field?; defaults to false>\n";
|
||||
print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /humgen/1kg/reference/human_b36_both\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
# generate a random number
|
||||
my $random_number = rand();
|
||||
my $tmp_prefix = "$tmp/$random_number";
|
||||
print "Writing temporary files to prefix: $tmp_prefix\n";
|
||||
my $unsorted_vcf = "$tmp_prefix.unsorted.vcf";
|
||||
|
||||
# lift over the file
|
||||
print "Lifting over the vcf...";
|
||||
my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $unsorted_vcf -chain $chain -dict $newRef.dict";
|
||||
if ($recordOriginalLocation) {
|
||||
$cmd .= " -recordOriginalLocation";
|
||||
}
|
||||
system($cmd) == 0 or quit("The liftover step failed. Please correct the necessary errors before retrying.");
|
||||
|
||||
# we need to sort the lifted over file now
|
||||
print "\nRe-sorting the vcf...\n";
|
||||
my $sorted_vcf = "$tmp_prefix.sorted.vcf";
|
||||
open(SORTED, ">$sorted_vcf") or die "can't open $sorted_vcf: $!";
|
||||
|
||||
# write the header
|
||||
open(UNSORTED, "< $unsorted_vcf") or die "can't open $unsorted_vcf: $!";
|
||||
my $inHeader = 1;
|
||||
while ( $inHeader == 1 ) {
|
||||
my $line = <UNSORTED>;
|
||||
if ( $line !~ m/^#/ ) {
|
||||
$inHeader = 0;
|
||||
} else {
|
||||
print SORTED "$line";
|
||||
}
|
||||
}
|
||||
close(UNSORTED);
|
||||
close(SORTED);
|
||||
|
||||
$cmd = "grep \"^#\" -v $unsorted_vcf | sort -n -k2 -T $tmp | $gatk/public/perl/sortByRef.pl --tmp $tmp - $newRef.fasta.fai >> $sorted_vcf";
|
||||
system($cmd) == 0 or quit("The sorting step failed. Please correct the necessary errors before retrying.");
|
||||
|
||||
# Filter the VCF for bad records
|
||||
print "\nFixing/removing bad records...\n";
|
||||
$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $sorted_vcf -o $out";
|
||||
system($cmd) == 0 or quit("The filtering step failed. Please correct the necessary errors before retrying.");
|
||||
|
||||
# clean up
|
||||
unlink $unsorted_vcf;
|
||||
unlink $sorted_vcf;
|
||||
my $sorted_index = "$sorted_vcf.idx";
|
||||
unlink $sorted_index;
|
||||
|
||||
print "\nDone!\n";
|
||||
|
||||
sub quit {
|
||||
print "\n$_[0]\n";
|
||||
exit(1);
|
||||
}
|
||||
|
|
@ -0,0 +1,127 @@
|
|||
#!/usr/bin/perl -w
|
||||
|
||||
use strict;
|
||||
use Getopt::Long;
|
||||
|
||||
sub usage {
|
||||
|
||||
print "\nUsage:\n";
|
||||
print "sortByRef.pl [--k POS] [--tmp dir] INPUT REF_DICT\n\n";
|
||||
|
||||
print " Sorts lines of the input file INFILE according\n";
|
||||
print " to the reference contig order specified by the\n";
|
||||
print " reference dictionary REF_DICT (.fai file).\n";
|
||||
print " The sort is stable. If -k option is not specified,\n";
|
||||
print " it is assumed that the contig name is the first\n";
|
||||
print " field in each line.\n\n";
|
||||
print " INPUT input file to sort. If '-' is specified, \n";
|
||||
print " then reads from STDIN.\n";
|
||||
print " REF_DICT .fai file, or ANY file that has contigs, in the\n";
|
||||
print " desired soting order, as its first column.\n";
|
||||
print " --k POS : contig name is in the field POS (1-based)\n";
|
||||
print " of input lines.\n\n";
|
||||
print " --tmp DIR : temp directory [default=/tmp]\n\n";
|
||||
|
||||
exit(1);
|
||||
}
|
||||
|
||||
my $pos = 1;
|
||||
my $tmp = "/tmp";
|
||||
GetOptions( "k:i" => \$pos,
|
||||
"tmp=s" => \$tmp);
|
||||
|
||||
$pos--;
|
||||
|
||||
usage() if ( scalar(@ARGV) == 0 );
|
||||
|
||||
if ( scalar(@ARGV) != 2 ) {
|
||||
print "Wrong number of arguments\n";
|
||||
usage();
|
||||
}
|
||||
|
||||
my $input_file = $ARGV[0];
|
||||
my $dict_file = $ARGV[1];
|
||||
|
||||
|
||||
open(DICT, "< $dict_file") or die("Can not open $dict_file: $!");
|
||||
|
||||
my %ref_order;
|
||||
|
||||
my $n = 0;
|
||||
while ( <DICT> ) {
|
||||
chomp;
|
||||
my ($contig, $rest) = split "\t";
|
||||
die("Dictionary file is probably corrupt: multiple instances of contig $contig") if ( defined $ref_order{$contig} );
|
||||
|
||||
$ref_order{$contig} = $n;
|
||||
$n++;
|
||||
}
|
||||
|
||||
close DICT;
|
||||
#we have loaded contig ordering now
|
||||
|
||||
my $INPUT;
|
||||
if ($input_file eq "-" ) {
|
||||
$INPUT = "STDIN";
|
||||
} else {
|
||||
open($INPUT, "< $input_file") or die("Can not open $input_file: $!");
|
||||
}
|
||||
|
||||
my %temp_outputs;
|
||||
|
||||
while ( <$INPUT> ) {
|
||||
|
||||
my @fields = split '\s';
|
||||
die("Specified field position exceeds the number of fields:\n$_")
|
||||
if ( $pos >= scalar(@fields) );
|
||||
|
||||
my $contig = $fields[$pos];
|
||||
if ( $contig =~ m/:/ ) {
|
||||
my @loc = split(/:/, $contig);
|
||||
# print $contig . " " . $loc[0] . "\n";
|
||||
$contig = $loc[0]
|
||||
}
|
||||
chomp $contig if ( $pos == scalar(@fields) - 1 ); # if last field in line
|
||||
|
||||
my $order;
|
||||
if ( defined $ref_order{$contig} ) { $order = $ref_order{$contig}; }
|
||||
else {
|
||||
$ref_order{$contig} = $n;
|
||||
$order = $n; # input line has contig that was not in the dict;
|
||||
$n++; # this contig will go at the end of the output,
|
||||
# after all known contigs
|
||||
}
|
||||
|
||||
my $fhandle;
|
||||
if ( defined $temp_outputs{$order} ) { $fhandle = $temp_outputs{$order} }
|
||||
else {
|
||||
#print "opening $order $$ $_\n";
|
||||
open( $fhandle, " > $tmp/sortByRef.$$.$order.tmp" ) or
|
||||
die ( "Can not open temporary file $order: $!");
|
||||
$temp_outputs{$order} = $fhandle;
|
||||
}
|
||||
|
||||
# we got the handle to the temp file that keeps all
|
||||
# lines with contig $contig
|
||||
|
||||
print $fhandle $_; # send current line to its corresponding temp file
|
||||
}
|
||||
|
||||
close $INPUT;
|
||||
|
||||
foreach my $f ( values %temp_outputs ) { close $f; }
|
||||
|
||||
# now collect back into single output stream:
|
||||
|
||||
for ( my $i = 0 ; $i < $n ; $i++ ) {
|
||||
# if we did not have any lines on contig $i, then there's
|
||||
# no temp file and nothing to do
|
||||
next if ( ! defined $temp_outputs{$i} ) ;
|
||||
|
||||
my $f;
|
||||
open ( $f, "< $tmp/sortByRef.$$.$i.tmp" );
|
||||
while ( <$f> ) { print ; }
|
||||
close $f;
|
||||
|
||||
unlink "$tmp/sortByRef.$$.$i.tmp";
|
||||
}
|
||||
|
|
@ -1,15 +1,15 @@
|
|||
package core
|
||||
package org.broadinstitute.sting.queue.qscripts
|
||||
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.function.ListWriterFunction
|
||||
|
||||
import net.sf.samtools.{SAMFileReader,SAMReadGroupRecord}
|
||||
|
||||
import scala.io.Source._
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel
|
||||
import org.broadinstitute.sting.queue.extensions.picard._
|
||||
import net.sf.samtools.{SAMFileReader, SAMReadGroupRecord}
|
||||
import net.sf.samtools.SAMFileHeader.SortOrder
|
||||
|
||||
|
||||
class DataProcessingPipeline extends QScript {
|
||||
|
|
@ -69,6 +69,8 @@ class DataProcessingPipeline extends QScript {
|
|||
@Input(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false)
|
||||
var useBWApe: Boolean = false
|
||||
|
||||
@Input(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false)
|
||||
var bwaThreads: Int = 1
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
|
|
@ -180,6 +182,7 @@ class DataProcessingPipeline extends QScript {
|
|||
var realignedBams: List[File] = List()
|
||||
var index = 1
|
||||
for (bam <- bams) {
|
||||
val readSortedBam = swapExt(bam, ".bam", "." + index + ".sorted.bam" )
|
||||
val saiFile1 = swapExt(bam, ".bam", "." + index + ".1.sai")
|
||||
val saiFile2 = swapExt(bam, ".bam", "." + index + ".2.sai")
|
||||
val realignedSamFile = swapExt(bam, ".bam", "." + index + ".realigned.sam")
|
||||
|
|
@ -190,11 +193,12 @@ class DataProcessingPipeline extends QScript {
|
|||
bwa_sam_se(bam, saiFile1, realignedSamFile))
|
||||
}
|
||||
else {
|
||||
add(bwa_aln_pe(bam, saiFile1, 1),
|
||||
bwa_aln_pe(bam, saiFile2, 2),
|
||||
bwa_sam_pe(bam, saiFile1, saiFile2, realignedSamFile))
|
||||
add(sortSam(bam, readSortedBam, SortOrder.queryname),
|
||||
bwa_aln_pe(readSortedBam, saiFile1, 1),
|
||||
bwa_aln_pe(readSortedBam, saiFile2, 2),
|
||||
bwa_sam_pe(readSortedBam, saiFile1, saiFile2, realignedSamFile))
|
||||
}
|
||||
add(sortSam(realignedSamFile, realignedBamFile))
|
||||
add(sortSam(realignedSamFile, realignedBamFile, SortOrder.coordinate))
|
||||
addReadGroups(realignedBamFile, rgRealignedBamFile, new SAMFileReader(bam))
|
||||
realignedBams :+= rgRealignedBamFile
|
||||
index = index + 1
|
||||
|
|
@ -385,9 +389,10 @@ class DataProcessingPipeline extends QScript {
|
|||
this.jobName = queueLogDir + outBam + ".joinBams"
|
||||
}
|
||||
|
||||
case class sortSam (inSam: File, outBam: File) extends SortSam {
|
||||
case class sortSam (inSam: File, outBam: File, sortOrderP: SortOrder) extends SortSam {
|
||||
this.input = List(inSam)
|
||||
this.output = outBam
|
||||
this.sortOrder = sortOrderP
|
||||
this.memoryLimit = 4
|
||||
this.isIntermediate = true
|
||||
this.analysisName = queueLogDir + outBam + ".sortSam"
|
||||
|
|
@ -430,7 +435,7 @@ class DataProcessingPipeline extends QScript {
|
|||
case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with BWACommonArgs {
|
||||
@Input(doc="bam file to be aligned") var bam = inBam
|
||||
@Output(doc="output sai file") var sai = outSai
|
||||
def commandLine = bwaPath + " aln -q 5 " + reference + " -b " + bam + " > " + sai
|
||||
def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b " + bam + " > " + sai
|
||||
this.analysisName = queueLogDir + outSai + ".bwa_aln_se"
|
||||
this.jobName = queueLogDir + outSai + ".bwa_aln_se"
|
||||
}
|
||||
|
|
@ -438,7 +443,7 @@ class DataProcessingPipeline extends QScript {
|
|||
case class bwa_aln_pe (inBam: File, outSai1: File, index: Int) extends CommandLineFunction with BWACommonArgs {
|
||||
@Input(doc="bam file to be aligned") var bam = inBam
|
||||
@Output(doc="output sai file for 1st mating pair") var sai = outSai1
|
||||
def commandLine = bwaPath + " aln -q 5 " + reference + " -b" + index + " " + bam + " > " + sai
|
||||
def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b" + index + " " + bam + " > " + sai
|
||||
this.analysisName = queueLogDir + outSai1 + ".bwa_aln_pe1"
|
||||
this.jobName = queueLogDir + outSai1 + ".bwa_aln_pe1"
|
||||
}
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package core
|
||||
package org.broadinstitute.sting.queue.qscripts
|
||||
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
|
|
@ -1,3 +1,5 @@
|
|||
package org.broadinstitute.sting.queue.qscripts
|
||||
|
||||
import org.broadinstitute.sting.commandline.Hidden
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package core
|
||||
package org.broadinstitute.sting.queue.qscripts
|
||||
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
|
|
@ -41,9 +41,9 @@ class RecalibrateBaseQualities extends QScript {
|
|||
|
||||
nContigs = getNumberOfContigs(input)
|
||||
|
||||
val recalFile1: File = new File("recal1.csv")
|
||||
val recalFile2: File = new File("recal2.csv")
|
||||
val recalBam: File = swapExt(input, ".bam", "recal.bam")
|
||||
val recalFile1: File = swapExt(input, ".bam", "recal1.csv")
|
||||
val recalFile2: File = swapExt(input, ".bam", "recal2.csv")
|
||||
val recalBam: File = swapExt(input, ".bam", "recal.bam")
|
||||
val path1: String = "before"
|
||||
val path2: String = "after"
|
||||
|
||||
|
|
@ -88,4 +88,4 @@ class RecalibrateBaseQualities extends QScript {
|
|||
this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates"
|
||||
this.jobName = queueLogDir + inRecalFile + ".analyze_covariates"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package core
|
||||
package org.broadinstitute.sting.queue.qscripts
|
||||
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.extensions.gatk.RodBind
|
||||
|
|
@ -1,3 +1,5 @@
|
|||
package org.broadinstitute.sting.queue.qscripts.examples
|
||||
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
|
||||
|
|
@ -1,3 +1,5 @@
|
|||
package org.broadinstitute.sting.queue.qscripts.examples
|
||||
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
|
||||
|
|
@ -1,3 +1,5 @@
|
|||
package org.broadinstitute.sting.queue.qscripts.examples
|
||||
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
|
||||
|
|
@ -1,3 +1,5 @@
|
|||
package org.broadinstitute.sting.queue.qscripts.examples
|
||||
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
|
||||
|
|
@ -1,3 +1,5 @@
|
|||
package org.broadinstitute.sting.queue.qscripts.examples
|
||||
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
|
||||
class HelloWorld extends QScript {
|
||||
|
|
@ -1,3 +1,5 @@
|
|||
package org.broadinstitute.sting.queue.qscripts.lib
|
||||
|
||||
import org.broadinstitute.sting.commandline.Hidden
|
||||
import org.broadinstitute.sting.queue.extensions.gatk.{RodBind, VariantsToTable}
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
|
|
@ -1,211 +0,0 @@
|
|||
package org.broadinstitute.sting.scala
|
||||
/**
|
||||
import gatk.walkers.genotyper.{UnifiedGenotyper, GenotypeCall}
|
||||
import java.io.File
|
||||
import net.sf.samtools.SAMRecord
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext
|
||||
import org.broadinstitute.sting.utils.Pair
|
||||
import org.broadinstitute.sting.utils.genotype.GenotypeMetaData
|
||||
import utils._
|
||||
import cmdLine.Argument
|
||||
|
||||
class TransitionTable() {
|
||||
var table: Array[Array[Double]] = new Array[Array[Double]](4,4)
|
||||
|
||||
def makeKey(b: Char): Int = {
|
||||
return BaseUtils.simpleBaseToBaseIndex(b)
|
||||
}
|
||||
|
||||
def fromKey(i: Int): Char = {
|
||||
return BaseUtils.baseIndexToSimpleBase(i)
|
||||
}
|
||||
|
||||
def add(ref: Char, subst: Char) = {
|
||||
table(makeKey(ref))(makeKey(subst)) += 1
|
||||
}
|
||||
|
||||
def get(i: Int, j: Int): Double = {
|
||||
return table(i)(j)
|
||||
}
|
||||
|
||||
def set(i: Int, j: Int, d: Double) = {
|
||||
//printf("Setting %d / %d => %f%n", i, j, d)
|
||||
table(i)(j) = d
|
||||
}
|
||||
|
||||
def mapEntries[A](f: ((Int, Int, Double) => A)): List[A] = {
|
||||
return for { i <- List.range(0, 4); j <- List.range(0,4) } yield f(i, j, get(i,j))
|
||||
}
|
||||
|
||||
def header(): String = {
|
||||
//return Utils.join("\t", mapEntries((i,j,x) => "P(%c|true=%c)" format (fromKey(j), fromKey(i))).toArray)
|
||||
return Utils.join("\t", mapEntries((i,j,x) => "[%c|%c]" format (fromKey(j), fromKey(i))).toArray)
|
||||
}
|
||||
|
||||
override def toString(): String = {
|
||||
return Utils.join("\t", mapEntries((i,j,x) => "%.2f" format x).toArray)
|
||||
}
|
||||
|
||||
def sum(): Double = {
|
||||
return sumList(table.toList map (x => sumList(x.toList)))
|
||||
}
|
||||
|
||||
def sumList(xs: List[Double]): Double = {
|
||||
return (0.0 :: xs) reduceLeft {(x, y) => x + y}
|
||||
}
|
||||
|
||||
def getRateMatrix(): TransitionTable = {
|
||||
var sums = (table.toList map (x => sumList(x.toList))).toArray
|
||||
var t = new TransitionTable()
|
||||
|
||||
//println(sums.toList)
|
||||
this mapEntries ((i,j,x) => t.set(i, j, x / sums(i)))
|
||||
|
||||
return t
|
||||
}
|
||||
}
|
||||
|
||||
**/class BaseTransitionTableCalculator /**{ // extends LocusWalker[Unit,Int] {
|
||||
private var MIN_MAPPING_QUALITY = 30
|
||||
private var MIN_BASE_QUALITY = 20
|
||||
private var MIN_LOD = 5
|
||||
private var PRINT_FREQ = 1000000
|
||||
private var CARE_ABOUT_STRAND = true
|
||||
|
||||
private val SSG = new UnifiedGenotyper()
|
||||
private val table1 = new TransitionTable()
|
||||
private val tableFWD = new TransitionTable()
|
||||
private val tableREV = new TransitionTable()
|
||||
private val table2 = new TransitionTable()
|
||||
|
||||
@Argument() { val shortName = "v", val doc = "Print out verbose output", val required = false }
|
||||
var VERBOSE: Boolean = false
|
||||
|
||||
override def initialize() {
|
||||
SSG.initialize();
|
||||
}
|
||||
|
||||
override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext): Unit = {
|
||||
val callPair = SSG.map(tracker, ref, context)
|
||||
//val call = callPair.getFirst().get(0)
|
||||
val pileup = new ReadBackedPileup(ref.getBase(), context)
|
||||
|
||||
def hasNoNs(): Boolean = {
|
||||
return ! (pileup.getBases() exists ('N' ==))
|
||||
}
|
||||
|
||||
if ( isDefinitelyHomRef(callPair) && hasNoNs() ) {
|
||||
var (refBases, nonRefBases) = splitBases(ref.getBase, pileup.getBases)
|
||||
|
||||
//printf("nonRefBases %s%n", nonRefBases)
|
||||
if ( nonRefBases.length > 0 ) {
|
||||
var (nonRefReads, offsets) = getNonRefReads(ref.getBase, pileup)
|
||||
var nonRefRead: SAMRecord = nonRefReads.head
|
||||
|
||||
nonRefBases.length match {
|
||||
case 1 =>
|
||||
//if ( goodRead(nonRefRead,offsets.head) ) {
|
||||
//printf("Including site:%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", nonRefBases, refBases, nonRefRead.format, offsets.head)
|
||||
//}
|
||||
addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table1)
|
||||
addRead(nonRefRead, offsets.head, nonRefBases.head, ref, if ( nonRefRead.getReadNegativeStrandFlag ) tableREV else tableFWD )
|
||||
case 2 =>
|
||||
addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table2)
|
||||
addRead(nonRefReads.tail.head, offsets.tail.head, nonRefBases.tail.head, ref, table2)
|
||||
case x =>
|
||||
}
|
||||
|
||||
if ( VERBOSE && pileup.size > 30 && nonRefReads.length < 4 )
|
||||
printNonRefBases(ref, nonRefReads, offsets, context.getReads.size())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
implicit def listToJavaList[T](l: List[T]) = java.util.Arrays.asList(l.toArray: _*)
|
||||
|
||||
def printNonRefBases(ref: ReferenceContext, nonRefReads: List[SAMRecord], offsets: List[Integer], depth: Integer): Unit = {
|
||||
out.println(new ReadBackedPileup(ref.getLocus(), ref.getBase(), listToJavaList(nonRefReads), offsets) + " " + depth)
|
||||
}
|
||||
|
||||
def addRead(nonRefRead: SAMRecord, offset: Integer, nonRefBase: Char, ref: ReferenceContext, table: TransitionTable): Unit = {
|
||||
if ( goodRead(nonRefRead, offset) ) {
|
||||
//printf("Including site:%n call=%s%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", nonRefBases, refBases, nonRefRead.format, offsets.head)
|
||||
//println(call.getReadDepth, call.getReferencebase, nonRefBases, refBases, nonRefRead.getMappingQuality)
|
||||
if ( CARE_ABOUT_STRAND && nonRefRead.getReadNegativeStrandFlag() ) {
|
||||
// it's on the reverse strand
|
||||
val refComp: Char = BaseUtils.simpleComplement(ref.getBase)
|
||||
val nonRefComp: Char = BaseUtils.simpleComplement(nonRefBase)
|
||||
|
||||
//printf("Negative: %c/%c is actually %c/%c%n", ref.getBase, nonRefBases.head, refComp, nonRefComp)
|
||||
table.add(refComp, nonRefComp)
|
||||
} else {
|
||||
table.add(ref.getBase, nonRefBase)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
def getNonRefReads(ref: Char, pileup: ReadBackedPileup): (List[SAMRecord], List[Integer]) = {
|
||||
def getBase(i: Int): Char = {
|
||||
var offset = pileup.getOffsets().get(i)
|
||||
return pileup.getReads().get(i).getReadString().charAt(offset.asInstanceOf[Int])
|
||||
}
|
||||
|
||||
var subset = for { i <- List.range(0, pileup.size) if getBase(i) != ref } yield i
|
||||
var reads = subset map (i => pileup.getReads().get(i))
|
||||
var offsets = subset map (i => pileup.getOffsets().get(i))
|
||||
|
||||
//println(reads, offsets, subset)
|
||||
return (reads, offsets)
|
||||
}
|
||||
|
||||
|
||||
def hasFewNonRefBases(refBase: Char, pileup: String): List[Char] = {
|
||||
splitBases(refBase, pileup) match {
|
||||
case (refs, nonRefs) =>
|
||||
//printf("nrf=%s, rb=%s%n", nonRefs.mkString, refs.mkString)
|
||||
return nonRefs
|
||||
}
|
||||
}
|
||||
|
||||
def goodRead( read: SAMRecord, offset: Integer ): Boolean = {
|
||||
return read.getMappingQuality() > MIN_MAPPING_QUALITY && read.getBaseQualities()(offset.intValue) > MIN_BASE_QUALITY
|
||||
}
|
||||
|
||||
def splitBases(refBase: Char, pileup: String): (List[Char], List[Char]) = {
|
||||
val refBases = pileup.toList filter (refBase ==)
|
||||
val nonRefBases = pileup.toList filter (refBase !=)
|
||||
return (refBases, nonRefBases)
|
||||
}
|
||||
|
||||
def isDefinitelyHomRef(call: Pair[java.util.List[GenotypeCall],GenotypeMetaData]): Boolean = {
|
||||
if ( call == null )
|
||||
return false;
|
||||
|
||||
return (! call.getFirst().get(0).isVariant()) && call.getFirst().get(0).getNegLog10PError() > MIN_LOD
|
||||
}
|
||||
|
||||
def reduceInit(): Int = {
|
||||
return 0;
|
||||
}
|
||||
|
||||
def reduce(m: Unit, r: Int): Int = {
|
||||
return r;
|
||||
}
|
||||
|
||||
override def onTraversalDone(r: Int) = {
|
||||
out.println("-------------------- FINAL RESULT --------------------")
|
||||
out.println("type\t" + table1.header)
|
||||
def print1(t: String, x: TransitionTable) = {
|
||||
out.println(t + "\t" + x)
|
||||
}
|
||||
|
||||
print1("1-mismatch", table1)
|
||||
print1("2-mismatch", table2)
|
||||
print1("1-mismatch-fwd", tableFWD)
|
||||
print1("1-mismatch-rev", tableREV)
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
|
@ -1,120 +0,0 @@
|
|||
package org.broadinstitute.sting.scala
|
||||
|
||||
import java.io.PrintStream
|
||||
import org.broadinstitute.sting.gatk.contexts.{ReferenceContext, AlignmentContext}
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker
|
||||
import collection.JavaConversions._
|
||||
import collection.immutable.List
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature
|
||||
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature
|
||||
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature
|
||||
import org.broadinstitute.sting.gatk.walkers.{TreeReducible, RefWalker}
|
||||
import org.broadinstitute.sting.commandline.{Output, Argument}
|
||||
import org.broadinstitute.sting.utils.{BaseUtils, GenomeLoc}
|
||||
import collection.mutable.{ListBuffer, HashSet}
|
||||
import java.lang.Math
|
||||
|
||||
class IntervalAnnotationWalker extends RefWalker[AnnotationMetaData,List[IntervalInfoBuilder]] {
|
||||
@Argument(doc="Min proportion of bases overlapping between an interval of interest and an annotation interval for annotation to occur",shortName="mpb")
|
||||
var minPropOverlap : Double = 0.50 // default to 50%
|
||||
@Output var out : PrintStream = _
|
||||
|
||||
override def reduceInit : List[IntervalInfoBuilder] = {
|
||||
Nil
|
||||
}
|
||||
|
||||
override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext) : AnnotationMetaData = {
|
||||
val ivalsOfInterest : List[GATKFeature] = tracker.getGATKFeatureMetaData("interval_list",false).toList
|
||||
val refSeqData : List[Object] = tracker.getReferenceMetaData("refseq").toList
|
||||
val tableMetaData : List[Object] = tracker.getReferenceMetaData("table",false).toList
|
||||
var amd : AnnotationMetaData = new AnnotationMetaData(ref.getLocus,ref.getBase)
|
||||
if ( refSeqData != null ) {
|
||||
amd.refSeqFeatures = refSeqData.map( (o: Object) => o.asInstanceOf[RefSeqFeature])
|
||||
}
|
||||
if ( tableMetaData != null ) {
|
||||
amd.tableFeatures = tableMetaData.map( (o: Object) => o.asInstanceOf[TableFeature]).map( (u: TableFeature) =>
|
||||
new Tuple2(u.getLocation,u.getAllValues.toList.reduceLeft(_ + "," + _)))
|
||||
}
|
||||
amd.newIvals = ivalsOfInterest.map(_.getLocation)
|
||||
return amd
|
||||
}
|
||||
|
||||
override def reduce(mapMetaData : AnnotationMetaData, rList : List[IntervalInfoBuilder]) : List[IntervalInfoBuilder] = {
|
||||
rList.filter( (u : IntervalInfoBuilder) => u.location.isBefore(mapMetaData.locus)).foreach(u => out.print("%s%n".format(u.done)))
|
||||
val newList : List[IntervalInfoBuilder] = rList.filter( ! _.finalized ) :::
|
||||
mapMetaData.newIvals.filter( (g: GenomeLoc) => g.getStart == mapMetaData.locus.getStart ).map( u => new IntervalInfoBuilder(u,minPropOverlap) )
|
||||
newList.foreach(_.update(mapMetaData))
|
||||
return newList
|
||||
}
|
||||
|
||||
override def onTraversalDone(finalList : List[IntervalInfoBuilder]) = {
|
||||
finalList.foreach(u => out.print("%s%n".format(u.done)))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class AnnotationMetaData(loc: GenomeLoc, ref: Byte) {
|
||||
val locus : GenomeLoc = loc
|
||||
var newIvals : List[GenomeLoc] = Nil
|
||||
var refSeqFeatures : List[RefSeqFeature] = Nil
|
||||
var tableFeatures : List[(GenomeLoc,String)] = Nil
|
||||
val refBase : Byte = ref
|
||||
}
|
||||
|
||||
class IntervalInfoBuilder(loc : GenomeLoc, minProp : Double) {
|
||||
|
||||
val OVERLAP_PROP : Double = minProp
|
||||
|
||||
var metaData : HashSet[String] = new HashSet[String]
|
||||
var seenTranscripts : HashSet[String] = new HashSet[String]
|
||||
var baseContent : ListBuffer[Byte] = new ListBuffer[Byte]
|
||||
var location : GenomeLoc = loc
|
||||
var gcContent : Double = _
|
||||
var entropy : Double = _
|
||||
var finalized : Boolean = false
|
||||
|
||||
|
||||
def update(mmd : AnnotationMetaData) = {
|
||||
baseContent += mmd.refBase
|
||||
mmd.refSeqFeatures.filter( p => ! seenTranscripts.contains(p.getTranscriptUniqueGeneName)).view.foreach(u => addRefSeq(u))
|
||||
mmd.tableFeatures.filter( (t : (GenomeLoc,String)) =>
|
||||
! metaData.contains(t._2) && (t._1.intersect(location).size.asInstanceOf[Double]/location.size() >= OVERLAP_PROP) ).foreach( t => metaData.add(t._2))
|
||||
}
|
||||
|
||||
def addRefSeq( rs: RefSeqFeature ) : Unit = {
|
||||
seenTranscripts.add(rs.getTranscriptUniqueGeneName)
|
||||
val exons : List[(GenomeLoc,Int)] = rs.getExons.toList.zipWithIndex.filter( p => ((p._1.intersect(location).size+0.0)/location.size) >= minProp )
|
||||
if ( exons.size > 0 ) {
|
||||
metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,exons.map( p => "exon%d".format(p._2) ).reduceLeft(_ + "," + _)))
|
||||
} else {
|
||||
if ( location.isBefore(rs.getExons.get(0)) || location.isPast(rs.getExons.get(rs.getNumExons-1)) ) {
|
||||
metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,"UTR"))
|
||||
} else {
|
||||
metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,"Intron"))
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
def done : String = {
|
||||
finalized = true
|
||||
def isGC(b : Byte) : Int = if ( BaseUtils.gIndex == b || BaseUtils.cIndex == b ) { 1 } else { 0 }
|
||||
gcContent = baseContent.foldLeft[Int](0)( (a,b) => a + isGC(b)).asInstanceOf[Double]/location.size()
|
||||
entropy = calcEntropy(baseContent.map(b => ListBuffer(b))) + calcEntropy(baseContent.reverse.map(b => ListBuffer(b)))
|
||||
val meta : String = metaData.reduceLeft(_ + "\t" + _)
|
||||
return "%s\t%d\t%d\t%.2f\t%.2f\t%s".format(location.getContig,location.getStart,location.getStop,gcContent,entropy,meta)
|
||||
}
|
||||
|
||||
def calcEntropy(byteList : ListBuffer[ListBuffer[Byte]]) : Double = {
|
||||
if(byteList.size == 1) return 0
|
||||
Math.log(1+byteList.tail.size-byteList.tail.dropWhile( u => u.equals(byteList(1))).size) +
|
||||
calcEntropy(byteList.tail.foldLeft(ListBuffer(byteList(0)))( (a,b) => {
|
||||
if ( b.equals(byteList(1)) ) {
|
||||
a.dropRight(1) :+ (a.last ++ b)
|
||||
} else {
|
||||
a :+ b
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,24 +0,0 @@
|
|||
package org.broadinstitute.sting.scala
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext
|
||||
|
||||
class ScalaCountLoci extends LocusWalker[Int,Int] {
|
||||
override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext): Int = {
|
||||
return 1
|
||||
}
|
||||
|
||||
def reduceInit(): Int = {
|
||||
return 0;
|
||||
}
|
||||
|
||||
def reduce(m: Int, r: Int): Int = {
|
||||
return m + r;
|
||||
}
|
||||
|
||||
def main(args: Array[String]) {
|
||||
println("Hello, world!")
|
||||
}
|
||||
}
|
||||
|
|
@ -1,71 +0,0 @@
|
|||
package org.broadinstitute.sting.queue.pipeline
|
||||
|
||||
import org.testng.annotations.Test
|
||||
import org.broadinstitute.sting.BaseTest
|
||||
|
||||
/**
 * Pipeline tests for the one-off QTools qscripts: VCF site/sample/interval
 * extraction, simple merging, and reference-order sorting.
 *
 * Each test configures a PipelineTestSpec, registers the expected output MD5,
 * and runs it via PipelineTest.executeTest. Bug fix: four of the five tests
 * previously built their spec but never executed it, so they asserted nothing.
 */
class IPFLibraryPipelineTest {

  /** Extracts site-level records from a VCF and verifies the output MD5. */
  @Test
  def testVCFExtractSites {
    val testOut = "vcfes.vcf"
    val spec = new PipelineTestSpec
    spec.name = "vcfExtractSites"
    spec.args = "-S scala/qscript/oneoffs/QTools.q -T VCFExtractSites -ivcf %s -out %s".format(
      BaseTest.validationDataLocation + "omni_1212.subset.b37.vcf", testOut
    )
    spec.fileMD5s += testOut -> "4f496b8cf90302428a9edda486a337f4"
    PipelineTest.executeTest(spec)
  }

  /** Extracts a named subset of samples from a VCF and verifies the output MD5. */
  @Test
  def testVCFExtractSamples {
    val testOut = "vcf.extract.samples.vcf"
    val spec = new PipelineTestSpec
    spec.name = "vcfExtractSamples"
    spec.args = "-S scala/qscript/oneoffs/QTools.q -T VCFExtractSamples -ivcf %s -out %s -sm HG00107,HG00500,NA18501,NA18942".format(
      BaseTest.validationDataLocation + "omni_1212.subset.b37.vcf", testOut
    )
    spec.fileMD5s += testOut -> "180d5a2e7a1fbc5117de6705bde3a7c8"
    // Was missing: without this call the spec is never actually run.
    PipelineTest.executeTest(spec)
  }

  /** Converts a VCF into an interval list and verifies the output MD5. */
  @Test
  def testVCFExtractIntervals {
    val testOut = "vcf.extract.intervals.list"
    val spec = new PipelineTestSpec
    spec.name = "vcfExtractIntervals"
    spec.args = "-S scala/qscript/oneoffs/QTools.q -T VCFExtractIntervals -ivcf %s -out %s".format(
      BaseTest.validationDataLocation + "omni_1212.subset.b37.vcf", testOut
    )
    spec.fileMD5s += testOut -> "28589eeb28fac753577c027abf939345"
    // Was missing: without this call the spec is never actually run.
    PipelineTest.executeTest(spec)
  }

  /** Merges two interleaved VCF subsets and verifies the combined output MD5. */
  @Test
  def testVCFSimpleMerge {
    val testOut = "vcf.simplemerge.vcf"
    val spec = new PipelineTestSpec
    val int1 = BaseTest.validationDataLocation + "omni.subset.interleaved.1.vcf"
    val int2 = BaseTest.validationDataLocation + "omni.subset.interleaved.2.vcf"
    spec.name = "vcfSimpleMerge"
    spec.args = "-S scala/qscript/oneoffs/QTools.q -T VCFSimpleMerge -vcfs %s,%s -out %s -ref %s".format(
      int1, int2, testOut, BaseTest.b37KGReference
    )
    spec.fileMD5s += testOut -> "c59b8de64ba787a2ca3a1734bdebf277"
    // Was missing: without this call the spec is never actually run.
    PipelineTest.executeTest(spec)
  }

  /** Sorts a position-sorted VCF into reference order and verifies the output MD5. */
  @Test
  def testSortByRef {
    val testOut = "vcf.refsorted.vcf"
    val spec = new PipelineTestSpec
    val unsorted = BaseTest.validationDataLocation + "omni.pos_sorted.vcf"
    spec.name = "sortByRef"
    spec.args = "-S scala/qscript/oneoffs/QTools.q -T SortByRef -ivcf %s -out %s -ref %s".format(
      unsorted, testOut, BaseTest.b37KGReference
    )
    spec.fileMD5s += testOut -> "ee09af803bc94987d55d044c2ebbc0b8"
    // Was missing: without this call the spec is never actually run.
    PipelineTest.executeTest(spec)
  }
}
|
||||
|
|
@ -32,8 +32,6 @@ import java.util.Date
|
|||
import java.text.SimpleDateFormat
|
||||
import org.broadinstitute.sting.BaseTest
|
||||
import org.broadinstitute.sting.queue.QCommandLine
|
||||
import org.broadinstitute.sting.datasources.pipeline.{Pipeline, PipelineProject, PipelineSample}
|
||||
import org.broadinstitute.sting.utils.broad.PicardAggregationUtils
|
||||
import org.broadinstitute.sting.queue.util.{Logging, ProcessController}
|
||||
import java.io.{FileNotFoundException, File}
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportParser
|
||||
|
|
@ -42,23 +40,6 @@ import org.broadinstitute.sting.queue.engine.CommandLinePluginManager
|
|||
|
||||
object PipelineTest extends BaseTest with Logging {
|
||||
|
||||
case class K1gBam(squidId: String, sampleId: String, version: Int)
|
||||
|
||||
/** 1000G BAMs used for validation */
|
||||
val k1gBams = List(
|
||||
new K1gBam("C474", "NA19651", 2),
|
||||
new K1gBam("C474", "NA19655", 2),
|
||||
new K1gBam("C474", "NA19669", 2),
|
||||
new K1gBam("C454", "NA19834", 2),
|
||||
new K1gBam("C460", "HG01440", 2),
|
||||
new K1gBam("C456", "NA12342", 2),
|
||||
new K1gBam("C456", "NA12748", 2),
|
||||
new K1gBam("C474", "NA19649", 2),
|
||||
new K1gBam("C474", "NA19652", 2),
|
||||
new K1gBam("C474", "NA19654", 2))
|
||||
|
||||
validateK1gBams()
|
||||
|
||||
private val validationReportsDataLocation = "/humgen/gsa-hpprojects/GATK/validationreports/submitted/"
|
||||
|
||||
val run = System.getProperty("pipeline.run") == "run"
|
||||
|
|
@ -92,49 +73,6 @@ object PipelineTest extends BaseTest with Logging {
|
|||
*/
|
||||
private def tempDir(testName: String, jobRunner: String) = testDir(testName, jobRunner) + "temp/"
|
||||
|
||||
/**
 * Assembles a Pipeline from a project definition and its samples.
 *
 * @param project Pipeline project info.
 * @param samples List of samples.
 * @return the populated Pipeline.
 */
def createPipeline(project: PipelineProject, samples: List[PipelineSample]) = {
  val assembled = new Pipeline
  assembled.setProject(project)
  assembled.setSamples(samples)
  assembled
}
|
||||
|
||||
/**
 * Builds a PipelineProject configured for hg19: b37 dbSNP 132 for
 * genotyping and b37 dbSNP 129 for evaluation.
 *
 * @param projectName Name of the project.
 * @param intervals The intervals file to use.
 * @return the configured PipelineProject.
 */
def createHg19Project(projectName: String, intervals: String) = {
  val proj = new PipelineProject
  proj.setName(projectName)
  proj.setReferenceFile(new File(BaseTest.hg19Reference))
  proj.setGenotypeDbsnp(new File(BaseTest.b37dbSNP132))
  proj.setEvalDbsnp(new File(BaseTest.b37dbSNP129))
  proj.setRefseqTable(new File(BaseTest.hg19Refseq))
  proj.setIntervalList(new File(intervals))
  proj
}
|
||||
|
||||
/**
 * Builds a PipelineSample for one of the 1000G bams.
 *
 * @param idPrefix Text to prepend to the sample name.
 * @param k1gBam bam to create the sample for.
 * @return the created PipelineSample.
 */
def createK1gSample(idPrefix: String, k1gBam: K1gBam) = {
  val result = new PipelineSample
  result.setId(idPrefix + "_" + k1gBam.sampleId)
  result.setBamFiles(Map("cleaned" -> getPicardBam(k1gBam)))
  result
}
|
||||
|
||||
/**
|
||||
* Runs the pipelineTest.
|
||||
* @param pipelineTest test to run.
|
||||
|
|
@ -267,31 +205,6 @@ object PipelineTest extends BaseTest with Logging {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Throws a FileNotFoundException if any of the 1000G bams do not exist and
 * warns (without failing) about any that are not the latest version.
 */
private def validateK1gBams() {
  // Stale versions are only worth a warning; the tests can still run on them.
  for (k1gBam <- k1gBams) {
    val latest = getLatestVersion(k1gBam)
    if (k1gBam.version != latest)
      logger.warn("1000G bam is not the latest version %d: %s".format(latest, k1gBam))
  }
  // Collect every missing file so the error reports them all at once,
  // instead of growing a mutable List with quadratic :+= appends.
  val missingBams = k1gBams.map(getPicardBam).filterNot(_.exists)
  if (missingBams.nonEmpty) {
    val nl = "%n".format()
    throw new FileNotFoundException("The following 1000G bam files are missing.%n%s".format(missingBams.mkString(nl)))
  }
}
|
||||
|
||||
/** Resolves the on-disk Picard aggregation bam for the given 1000G sample. */
private def getPicardBam(k1gBam: K1gBam): File = {
  val path = PicardAggregationUtils.getSampleBam(k1gBam.squidId, k1gBam.sampleId, k1gBam.version)
  new File(path)
}
|
||||
|
||||
/** Asks Picard aggregation for the newest available version of this bam. */
private def getLatestVersion(k1gBam: K1gBam): Int = {
  val K1gBam(squid, sample, version) = k1gBam
  PicardAggregationUtils.getLatestVersion(squid, sample, version)
}
|
||||
|
||||
private var runningCommandLines = Set.empty[QCommandLine]
|
||||
|
||||
Runtime.getRuntime.addShutdownHook(new Thread {
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ class ExampleCountLociPipelineTest {
|
|||
val spec = new PipelineTestSpec
|
||||
spec.name = "countloci"
|
||||
spec.args = Array(
|
||||
" -S scala/qscript/examples/ExampleCountLoci.scala",
|
||||
" -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala",
|
||||
" -R " + BaseTest.hg18Reference,
|
||||
" -I " + BaseTest.validationDataLocation + "small_bam_for_countloci.bam",
|
||||
" -o " + testOut).mkString
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ class HelloWorldPipelineTest {
|
|||
def testHelloWorld {
  val spec = new PipelineTestSpec
  spec.name = "HelloWorld"
  // Dead store removed: spec.args was assigned twice and only the final
  // (public/scala/...) path took effect.
  spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala"
  PipelineTest.executeTest(spec)
}
|
||||
|
||||
|
|
@ -40,7 +40,7 @@ class HelloWorldPipelineTest {
|
|||
def testHelloWorldWithPrefix {
  val spec = new PipelineTestSpec
  spec.name = "HelloWorldWithPrefix"
  // Dead store removed: spec.args was assigned twice and only the final
  // (public/scala/...) path took effect.
  spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -jobPrefix HelloWorld"
  PipelineTest.executeTest(spec)
}
|
||||
|
||||
|
|
@ -48,7 +48,7 @@ class HelloWorldPipelineTest {
|
|||
def testHelloWorldWithPriority {
  val spec = new PipelineTestSpec
  spec.name = "HelloWorldWithPriority"
  // Dead store removed: spec.args was assigned twice and only the final
  // (public/scala/...) path took effect.
  spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -jobPriority 100"
  PipelineTest.executeTest(spec)
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue