Merge branch 'master' of ssh://delangel@nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Guillermo del Angel 2011-07-02 16:45:34 -04:00
commit c6c0dba040
64 changed files with 1106 additions and 2280 deletions

View File

@ -64,7 +64,7 @@
<property name="usefile" value="false" />
<!-- If running the 'package' task, this property controls the name of the xml file to package -->
<property name="package.xml.dir" value="packages" />
<property name="package.xml.dir" value="${public.dir}/packages" />
<property name="executable" value="GenomeAnalysisTK" />
<property environment="env"/>
@ -896,7 +896,7 @@
<!-- Build a package consisting of all supporting files -->
<target name="package" depends="dist,stage" description="bundle up an executable for distribution">
<mkdir dir="dist/packages" />
<xslt destdir="dist/packages" style="packages/CreatePackager.xsl" useImplicitFileset="false">
<xslt destdir="dist/packages" style="${package.xml.dir}/CreatePackager.xsl" useImplicitFileset="false">
<flattenmapper/>
<fileset dir="${package.xml.dir}">
<include name="${executable}.xml" />
@ -934,7 +934,7 @@
<!-- Use the packaging system to extract parts of picard-private we need -->
<mkdir dir="${dist.dir}/packages" />
<xslt in="packages/PicardPrivate.xml" out="${dist.dir}/packages/BuildPicardPrivate.xml" style="packages/CreatePackager.xsl" />
<xslt in="${package.xml.dir}/PicardPrivate.xml" out="${dist.dir}/packages/BuildPicardPrivate.xml" style="${package.xml.dir}/CreatePackager.xsl" />
<ant antfile="${dist.dir}/packages/BuildPicardPrivate.xml">
<property name="additional.jars" value="${required.picard.jars}" />
</ant>

View File

@ -0,0 +1,87 @@
#!/bin/env Rscript
# Plot VQSR tranches: stacked TP/FP bars per tranche plus a
# sensitivity-vs-specificity curve, written to <tranchesFile>.pdf.
#
# Usage: Rscript <script> tranchesFile targetTITV targetSensitivity [suppressLegend]
args <- commandArgs(TRUE)
verbose = TRUE
# Input tranches table (comma-separated, produced by the VQSR walker).
tranchesFile = args[1]
# Expected Ti/Tv ratio of true variants for this call set.
targetTITV = as.numeric(args[2])
# Truth-sensitivity target to mark on the sensitivity/specificity plot.
targetSensitivity = as.numeric(args[3])
# Presence of any fourth argument (its value is ignored) hides the legend.
suppressLegend = ! is.na(args[4])
# -----------------------------------------------------------------------------------------------
# Useful general routines
# -----------------------------------------------------------------------------------------------
# Smallest false-positive rate we will ever report: 1 in 1000.
MIN_FP_RATE <- 0.001

# Estimate the false-positive rate implied by an observed Ti/Tv ratio.
# True variants are assumed to have Ti/Tv = titvExpected and false positives
# Ti/Tv ~= 0.5 (random substitutions); the estimate is clamped to
# [MIN_FP_RATE, 1].
titvFPEst <- function(titvExpected, titvObserved) {
  estimate <- 1 - (titvObserved - 0.5) / (titvExpected - 0.5)
  min(max(estimate, MIN_FP_RATE), 1)
}

# Vectorized wrapper around titvFPEst for a vector of observed Ti/Tv values.
titvFPEstV <- function(titvExpected, titvs) {
  sapply(titvs, function(observed) titvFPEst(titvExpected, observed))
}
# Partition a variant count into expected true and false positives, given a
# false discovery rate expressed as a percentage.
nTPFP <- function(nVariants, FDR) {
  fdrFraction <- FDR / 100
  list(TP = nVariants * (1 - fdrFraction), FP = nVariants * fdrFraction)
}
# Shift a vector one position to the left: result[i] == x[i + 1], with
# `leftValue` filling the vacated last slot.
#
# Fix: the original explicit loop iterated over 1:(length(x)-1), which counts
# DOWN (c(1, 0)) for length-1 input and overwrote the single slot with
# x[2] (NA) instead of leaving leftValue in place; empty input produced a
# spurious length-1 NA. Vectorized subsetting handles all lengths correctly.
leftShift <- function(x, leftValue = 0) {
  if (length(x) == 0) {
    return(x)
  }
  c(x[-1], leftValue)
}
# -----------------------------------------------------------------------------------------------
# Tranches plot
# -----------------------------------------------------------------------------------------------
# Read the comma-separated tranches table and order tranches by novel Ti/Tv.
data2 = read.table(tranchesFile,sep=",",head=T)
data2 = data2[order(data2$novelTiTv, decreasing=F),]
#data2 = data2[order(data2$FDRtranche, decreasing=T),]
# Bar segments: cumulative TPs, tranche-specific TPs, tranche-specific FPs,
# cumulative FPs (colors and hatch densities in the same order).
cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange")
density=c(20, -1, -1, 20)
outfile = paste(tranchesFile, ".pdf", sep="")
pdf(outfile, height=5, width=8)
par(mar = c(5, 5, 4, 2) + 0.1)
# NOTE(review): data2$novelTITV is presumably NULL (the column read above is
# novelTiTv), which would make this c() a no-op — confirm the intended column.
novelTiTv = c(data2$novelTITV,data2$novelTiTv)
# Estimated fraction of novel calls in each tranche that are true positives.
alpha = 1 - titvFPEstV(targetTITV, novelTiTv)
#print(alpha)
numGood = round(alpha * data2$numNovel);
#numGood = round(data2$numNovel * (1-data2$targetTruthSensitivity/100))
numBad = data2$numNovel - numGood;
# Split each tranche's counts into the part carried over from the previous
# (smaller) tranche and the part newly added by this tranche.
numPrevGood = leftShift(numGood, 0)
numNewGood = numGood - numPrevGood
numPrevBad = leftShift(numBad, 0)
numNewBad = numBad - numPrevBad
d=matrix(c(numPrevGood,numNewGood, numNewBad, numPrevBad),4,byrow=TRUE)
#print(d)
# Stacked horizontal bars, one per tranche, in thousands of novel variants.
barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (1000s)", density=density, cex.axis=1.25, cex.lab=1.25) # , xlim=c(250000,350000))
#abline(v= d[2,dim(d)[2]], lty=2)
#abline(v= d[1,3], lty=2)
if ( ! suppressLegend )
legend(3, length(data2$targetTruthSensitivity)/3 +1, c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25)
# Annotate each bar with its truth sensitivity and novel Ti/Tv on the left
# margin (bar pitch is 1.2 = bar width 1 + space 0.2).
mtext("Ti/Tv",2,line=2.25,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
mtext("truth",2,line=0,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1)
axis(2,line=-1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=data2$targetTruthSensitivity, las=1, cex.axis=1.0)
axis(2,line=1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0)
# plot sensitivity vs. specificity
sensitivity = data2$truthSensitivity
if ( ! is.null(sensitivity) ) {
#specificity = titvFPEstV(targetTITV, novelTiTv)
# Novel Ti/Tv stands in for specificity; dashed lines mark the targets.
specificity = novelTiTv
plot(sensitivity, specificity, type="b", col="cornflowerblue", xlab="Tranche truth sensitivity", ylab="Specificity (Novel Ti/Tv ratio)")
abline(h=targetTITV, lty=2)
abline(v=targetSensitivity, lty=2)
#text(max(sensitivity), targetTITV-0.05, labels="Expected novel Ti/Tv", pos=2)
}
dev.off()

View File

@ -0,0 +1,108 @@
#!/bin/env Rscript
# Base-quality recalibration covariate report. Given a recalibration table
# and a covariate name, writes three PDFs next to the input file:
#   1. residual error (empirical - reported quality) vs. the covariate,
#   2. mean reported quality vs. the covariate,
#   3. a histogram of bases per covariate value.
#
# Usage: Rscript <script> recalTable covariateName
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
covariateName = args[2]
outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
# NOTE(review): `c` shadows base::c from here on. Calls like c(-10, 10) still
# work because R resolves functions separately, but renaming would be clearer.
c <- read.table(input, header=T)
# Order rows by the covariate value (first column).
c <- c[sort.list(c[,1]),]
#
# Plot residual error as a function of the covariate
#
# Split bins into well-observed (>= 1000 bases) and sparse ones; sparse bins
# are drawn in a lighter color throughout.
d.good <- c[c$nBases >= 1000,]
d.1000 <- c[c$nBases < 1000,]
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) )
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
# If every bin is well observed a single RMSE suffices for the title.
if( length(d.good$nBases) == length(c$nBases) ) {
theTitle = paste("RMSE =", round(rmseAll,digits=3))
}
# Don't let residual error go off the edge of the plot
d.good$residualError = d.good$Qempirical-d.good$Qreported
d.good$residualError[which(d.good$residualError > 10)] = 10
d.good$residualError[which(d.good$residualError < -10)] = -10
d.1000$residualError = d.1000$Qempirical-d.1000$Qreported
d.1000$residualError[which(d.1000$residualError > 10)] = 10
d.1000$residualError[which(d.1000$residualError < -10)] = -10
c$residualError = c$Qempirical-c$Qreported
c$residualError[which(c$residualError > 10)] = 10
c$residualError[which(c$residualError < -10)] = -10
# With few covariate values, connect the points ("o") instead of plain dots.
pointType = "p"
if( length(c$Covariate) <= 20 ) {
pointType = "o"
}
if( is.numeric(c$Covariate) ) {
plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20)
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10))
points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue")
}
dev.off()
#
# Plot mean quality versus the covariate
#
outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
pointType = "p"
if( length(c$Covariate) <= 20 ) {
pointType = "o"
}
theTitle = paste("Quality By", covariateName);
if( is.numeric(c$Covariate) ) {
plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate)))
points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20)
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40))
points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue")
}
dev.off()
#
# Plot histogram of the covariate
#
e = d.good
f = d.1000
outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
# Drop empty bins; subset() renames the columns to e.Covariate / e.nBases etc.
hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0)
# Thicker histogram bars when there are few covariate values.
lwdSize=2
if( length(c$Covariate) <= 20 ) {
lwdSize=7
} else if( length(c$Covariate) <= 70 ) {
lwdSize=4
}
if( is.numeric(c$Covariate) ) {
if( length(hst$e.Covariate) == 0 ) {
plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
} else {
plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue")
}
# Re-draw the y axis with non-scientific labels.
axis(2,axTicks(2), format(axTicks(2), scientific=F))
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n")
# Label every other tick when there are many categorical values.
if( length(hst$c.Covariate) > 9 ) {
axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
} else {
axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate)
}
axis(2,axTicks(2), format(axTicks(2), scientific=F))
}
dev.off()

View File

@ -0,0 +1,70 @@
#!/bin/env Rscript
# Base-quality recalibration summary. Writes three PDFs next to the input:
# empirical vs. reported quality scatter, and histograms of empirical and
# reported quality, color-coding bins by how well observed they are.
#
# Usage: Rscript <script> recalTable Qcutoff maxQ maxHist
args <- commandArgs(TRUE)
input = args[1]
# Reported qualities below this cutoff are plotted separately (maroon).
Qcutoff = as.numeric(args[2])
# Upper limit for both quality axes.
maxQ = as.numeric(args[3])
# Fixed y-axis maximum for the histograms; 0 means auto-scale.
maxHist = as.numeric(args[4])
t=read.table(input, header=T)
#
# Plot of reported quality versus empirical quality
#
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
pdf(outfile, height=7, width=7)
# Above the cutoff, split bins by observation depth: >= 10000 bases (good),
# < 1000 (sparse), and in between.
d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,]
d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,]
d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,]
f <- t[t$Qreported < Qcutoff,]
e <- rbind(d.good, d.1000, d.10000)
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) )
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
# If every above-cutoff bin is well observed, one RMSE suffices for the title.
if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) {
theTitle = paste("RMSE =", round(rmseAll,digits=3));
}
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16)
points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
# Dashed y = x line: perfectly calibrated qualities fall on it.
abline(0,1, lty=2)
dev.off()
#
# Plot Q empirical histogram
#
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
# Drop empty bins; subset() renames columns to e.Qempirical / e.nBases etc.
hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0)
# Shannon entropy (bits) of the empirical-quality distribution.
percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases))
entropy = -sum(log2(percentBases)*percentBases)
yMax = max(hst$e.nBases)
if(maxHist != 0) {
yMax = maxHist
}
plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n")
points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()
#
# Plot Q reported histogram
#
outfile = paste(input, ".quality_rep_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0)
yMax = max(hst$e.nBases)
if(maxHist != 0) {
yMax = maxHist
}
# NOTE(review): `entropy` here is still the EMPIRICAL-histogram entropy
# computed above; it is never recomputed for the reported qualities —
# confirm whether this title is intentional.
plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n")
points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()

View File

@ -0,0 +1,138 @@
# Estimate the false-positive rate implied by an observed Ti/Tv ratio,
# assuming false positives have Ti/Tv ~= 0.5; clamped to [0.001, 1].
titvFPEst <- function(titvExpected, titvObserved) {
  estimate <- 1 - (titvObserved - 0.5) / (titvExpected - 0.5)
  min(max(estimate, 0.001), 1)
}

# Vectorized wrapper around titvFPEst for a vector of observed Ti/Tv values.
titvFPEstV <- function(titvExpected, titvs) {
  sapply(titvs, function(observed) titvFPEst(titvExpected, observed))
}

# Estimated heterozygosity: known calls plus the novel calls not attributed
# to false positives, scaled by 2 / (3 * callable bases).
calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) {
  truePositives <- nknown + (1 - titvFPEst(knownTiTv, novelTiTv)) * nnovel
  2 * truePositives / 3 / callable
}
# Ti/Tv ratio of the marginal call set obtained by subtracting call set y
# from call set x, given each set's size and Ti/Tv ratio.
marginalTiTv <- function( nx, titvx, ny, titvy ) {
  transversionsX <- nx / (titvx + 1)
  transitionsX <- nx - transversionsX
  transversionsY <- ny / (titvy + 1)
  transitionsY <- ny - transversionsY
  (transitionsX - transitionsY) / (transversionsX - transversionsY)
}
# dbSNP rate (as a percentage) of the marginal call set obtained by
# subtracting call set y from call set x, given each set's size and dbSNP
# percentage.
marginaldbSNPRate <- function( nx, dbx, ny, dby ) {
  knownX <- nx * dbx / 100
  knownY <- ny * dby / 100
  knownDiff <- knownX - knownY
  novelDiff <- (nx - knownX) - (ny - knownY)
  100 * knownDiff / (knownDiff + novelDiff)
}
# Expected number of SNP calls in a region of length L: L * theta * called
# fraction times the harmonic number over 2N sampled chromosomes (Watterson
# expectation), partitioned into known/novel by dbSNPRate (a fraction).
numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) {
  harmonicNumber <- sum(1 / seq(1, 2 * nIndividuals))
  total <- L * theta * calledFractionOfRegion * harmonicNumber
  list(nCalls = total, nKnown = dbSNPRate * total, nNovel = (1 - dbSNPRate) * total)
}
# Rescale a vector so its entries sum to one.
normalize <- function(x) {
  total <- sum(x)
  x / total
}

# Cumulative distribution of a vector after normalization.
normcumsum <- function(x) {
  cumsum(normalize(x))
}
# Plot the values of d in ascending order as an orange connected curve
# (a cumulative-histogram-style view); extra graphical parameters are
# forwarded to plot().
cumhist <- function(d, ...) {
plot(d[order(d)], type="b", col="orange", lwd=2, ...)
}
# Suffix sums: element i of the result is sum(x[i:length(x)]).
revcumsum <- function(x) {
  reversedSums <- cumsum(rev(x))
  rev(reversedSums)
}
# Convert an error probability to a Phred-scaled quality score; the
# probability is floored at 10^-9.9 so the result is capped at Q99.
phred <- function(x) {
  -10 * log10(max(x, 10^(-9.9)))
}

# Probability of observing base b when the sampled allele is B and the base
# quality is Q (Phred scale): the read is correct with p = 1 - 10^(-Q/10).
pOfB <- function(b, B, Q) {
  pCorrect <- 1 - 10^(-Q / 10)
  if (b == B) {
    pCorrect
  } else {
    1 - pCorrect
  }
}

# log10 likelihood of the diploid genotype G (a vector of two alleles) given
# observed bases bs and their qualities qs; each read is assumed to sample
# either allele with probability 1/2.
#
# Fix: iterate with seq_along(bs) instead of 1:length(bs), which counted
# down (c(1, 0)) and indexed out of bounds when bs was empty.
pOfG <- function(bs, qs, G) {
  alleleOne <- G[1]
  alleleTwo <- G[2]
  log10p <- 0
  for (readIndex in seq_along(bs)) {
    base <- bs[readIndex]
    qual <- qs[readIndex]
    pRead <- pOfB(base, alleleOne, qual) / 2 + pOfB(base, alleleTwo, qual) / 2
    log10p <- log10p + log10(pRead)
  }
  log10p
}

# Genotype likelihoods for a pileup of nAs "a" reads and nBs "t" reads, all
# at quality Q, over the genotypes aa / at / tt. Qsample is the Phred-scaled
# confidence that the site is the heterozygous genotype "at".
pOfGs <- function(nAs, nBs, Q) {
  bs <- c(rep("a", nAs), rep("t", nBs))
  qs <- rep(Q, nAs + nBs)
  log10pAA <- pOfG(bs, qs, c("a", "a"))
  log10pAT <- pOfG(bs, qs, c("a", "t"))
  log10pTT <- pOfG(bs, qs, c("t", "t"))
  pHet <- 10^log10pAT / sum(10^(c(log10pAA, log10pAT, log10pTT)))
  list(p1 = log10pAA, p2 = log10pAT, p3 = log10pTT, Qsample = phred(1 - pHet))
}
# Expected Phred-scaled het-call confidence (Qsample) at mean coverage
# `depth` and base quality Q, averaging over the realized depth
# d ~ Poisson(depth) (truncated at 3 * depth) and the binomial split of
# reads between the two alleles.
# NOTE(review): prints a progress data frame for every (d, nBs) combination,
# which is very noisy for large depths — confirm whether this debug output
# is still wanted.
QsampleExpected <- function(depth, Q) {
weightedAvg = 0
for ( d in 1:(depth*3) ) {
Qsample = 0
pOfD = dpois(d, depth)
for ( nBs in 0:d ) {
pOfnB = dbinom(nBs, d, 0.5)
nAs = d - nBs
Qsample = pOfGs(nAs, nBs, Q)$Qsample
#Qsample = 1
weightedAvg = weightedAvg + Qsample * pOfD * pOfnB
print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg)))
}
}
return(weightedAvg)
}
# Plot expected Qsample (via QsampleExpected) against coverage, one rainbow
# colored curve per base quality in Qs, with a legend in the top-left corner.
plotQsamples <- function(depths, Qs, Qmax) {
cols = rainbow(length(Qs))
plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling")
for ( i in 1:length(Qs) ) {
Q = Qs[i]
y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q)))
points(depths, y, col=cols[i], type="b")
}
legend("topleft", paste("Q", Qs), fill=cols)
}
# For each realized depth d in 0..2*depth (d ~ Poisson(depth)), the joint
# probability that the site has depth d AND more than `nallelesToCall`
# reads carry the alternate allele (Binomial(d, 0.5)).
pCallHetGivenDepth <- function(depth, nallelesToCall) {
  realizedDepths <- 0:(2 * depth)
  pTooFewAlleles <- sapply(realizedDepths,
                           function(d) sum(dbinom(0:nallelesToCall, d, 0.5)))
  dpois(realizedDepths, depth) * (1 - pTooFewAlleles)
}

# Total probability of calling a het at mean coverage `depth`.
pCallHets <- function(depth, nallelesToCall) {
  sum(pCallHetGivenDepth(depth, nallelesToCall))
}

# Probability that at least one of `nsamples` independent samples yields a
# het call at mean coverage `depth`.
pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) {
  1 - (1 - pCallHets(depth, nallelesToCall))^nsamples
}

View File

@ -1,154 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.analyzeannotations;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.commandline.Argument;
import java.util.HashMap;
import java.util.Collection;
/**
* Takes variant calls as .vcf files and creates plots of truth metrics as a function of the various annotations found in the INFO field.
*
* @author rpoplin
* @since Jan 15, 2010
* @help.summary Takes variant calls as .vcf files and creates plots of truth metrics as a function of the various annotations found in the INFO field.
*/
public class AnalyzeAnnotationsWalker extends RodWalker<Integer, Integer> {
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@Argument(fullName = "output_prefix", shortName = "output", doc = "The output path and name to prepend to all plots and intermediate data files", required = false)
private String OUTPUT_PREFIX = "analyzeAnnotations/";
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
private String PATH_TO_RSCRIPT = "Rscript";
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
private String PATH_TO_RESOURCES = "R/";
@Argument(fullName = "min_variants_per_bin", shortName = "minBinSize", doc = "The minimum number of variants in a bin in order to calculate truth metrics.", required = false)
private int MIN_VARIANTS_PER_BIN = 1000;
@Argument(fullName = "max_variants_per_bin", shortName = "maxBinSize", doc = "The maximum number of variants in a bin.", required = false)
private int MAX_VARIANTS_PER_BIN = 20000;
@Argument(fullName = "sampleName", shortName = "sampleName", doc = "If supplied, only process variants found in this sample.", required = false)
private String SAMPLE_NAME = null;
@Argument(fullName = "name", shortName = "name", doc = "Labels for the annotations to make plots look nicer. Each name is a separate -name argument. For example, -name DP,Depth -name AB,AlleleBalance", required = false)
private String[] ANNOTATION_NAMES = null;
@Argument(fullName = "indicate_mean_num_vars", shortName = "meanNumVars", doc = "If supplied, plots will indicate the distribution of number of variants instead of distribution of value of annotation", required = false)
private boolean INDICATE_MEAN_NUM_VARS = false;
/////////////////////////////
// Private Member Variables
/////////////////////////////
// Accumulates per-annotation statistics over the whole traversal.
private AnnotationDataManager dataManager;
//---------------------------------------------------------------------------------------------------------------
//
// initialize
//
//---------------------------------------------------------------------------------------------------------------
// Parse the -name "KEY,Label" arguments into a key->label map, construct the
// data manager, and normalize the resources path to end with a slash.
public void initialize() {
// Create a HashMap associating the names of the annotations to full Strings that can be used as labels on plots
HashMap<String,String> nameMap = null;
if( ANNOTATION_NAMES != null ) {
nameMap = new HashMap<String,String>();
for( String nameLine : ANNOTATION_NAMES ) {
// Each -name argument is "annotationKey,plotLabel".
String[] vals = nameLine.split(",");
nameMap.put(vals[0],vals[1]);
}
}
dataManager = new AnnotationDataManager( nameMap, INDICATE_MEAN_NUM_VARS );
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
}
//---------------------------------------------------------------------------------------------------------------
//
// map
//
//---------------------------------------------------------------------------------------------------------------
// For each locus: determine whether the site appears in a truth track
// (source name starting with "TRUTH") and whether the truth track confirms
// a biallelic variant, then feed every non-truth, unfiltered biallelic SNP's
// annotations into the data manager.
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
if( tracker != null ) {
Collection<VariantContext> VCs = tracker.getAllVariantContexts(ref);
// First find out if this variant is in the truth sets
boolean isInTruthSet = false;
boolean isTrueVariant = false;
for ( VariantContext vc : VCs ) {
if( vc != null && vc.isSNP() && !vc.isFiltered() ) {
if( vc.getSource().toUpperCase().startsWith("TRUTH") ) {
isInTruthSet = true;
if( vc.isBiallelic() && vc.isVariant() ) {
isTrueVariant = true;
}
}
}
}
// Add each annotation in this VCF Record to the dataManager
for ( VariantContext vc : VCs ) {
if( vc != null && vc.isSNP() && vc.isBiallelic() && !vc.isFiltered() ) {
if( !vc.getSource().toUpperCase().startsWith("TRUTH") ) {
if( vc.isVariant() ) {
dataManager.addAnnotations( vc, SAMPLE_NAME, isInTruthSet, isTrueVariant );
}
}
}
}
}
return 1; // This value isn't actually used for anything
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce
//
//---------------------------------------------------------------------------------------------------------------
public Integer reduceInit() {
return 0; // Nothing to do here
}
public Integer reduce( Integer value, Integer sum ) {
return 0; // Nothing to do here
}
// After traversal, dump intermediate tables and invoke Rscript to plot them.
public void onTraversalDone( Integer sum ) {
// For each annotation, decide how to cut up the data, output intermediate cumulative p(true) tables, and call RScript to plot the tables
dataManager.plotCumulativeTables(PATH_TO_RSCRIPT, PATH_TO_RESOURCES, OUTPUT_PREFIX, MIN_VARIANTS_PER_BIN, MAX_VARIANTS_PER_BIN);
}
}

View File

@ -1,169 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.analyzeannotations;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.File;
import java.util.*;
import java.io.IOException;
import java.io.PrintStream;
import java.io.FileNotFoundException;
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Jan 18, 2010
*/
// Collects per-annotation-value Ti/Tv statistics for variant calls and, at
// the end of the traversal, writes binned tables and launches Rscript to
// plot them.
public class AnnotationDataManager {
// Annotation key -> bins ordered by annotation value.
private final HashMap<String, TreeSet<AnnotationDatum>> data;
// Optional annotation key -> human-readable plot label.
private final HashMap<String,String> nameMap;
// Forwarded to the R script to toggle what the plots indicate.
private final boolean INDICATE_MEAN_NUM_VARS;
public AnnotationDataManager( HashMap<String,String> _nameMap, boolean _INDICATE_MEAN_NUM_VARS ) {
data = new HashMap<String, TreeSet<AnnotationDatum>>();
nameMap = _nameMap;
INDICATE_MEAN_NUM_VARS = _INDICATE_MEAN_NUM_VARS;
}
// Record every numeric INFO-field annotation (plus QUAL) of this variant in
// the bin matching its value, tagging it as Ti or Tv and by novelty/truth
// category. Non-numeric annotations (e.g. "DB") are skipped.
public void addAnnotations( final VariantContext vc, final String sampleName, final boolean isInTruthSet, final boolean isTrueVariant ) {
if( sampleName != null ) { // Only process variants that are found in the sample with this sampleName
if( vc.getGenotype(sampleName).isNoCall() ) { // This variant isn't found in this sample so break out
return;
}
} // else, process all samples
// Loop over each annotation in the vcf record
final Map<String,Object> infoField = new HashMap<String, Object>(vc.getAttributes());
infoField.put("QUAL", ((Double)vc.getPhredScaledQual()).toString() ); // add QUAL field to annotations
for( Map.Entry<String, Object> annotation : infoField.entrySet() ) {
float value;
try {
value = Float.parseFloat( annotation.getValue().toString() );
} catch( NumberFormatException e ) {
continue; // Skip over annotations that aren't floats, like "DB"
}
TreeSet<AnnotationDatum> treeSet = data.get( annotation.getKey() );
if( treeSet == null ) { // This annotation hasn't been seen before
treeSet = new TreeSet<AnnotationDatum>( new AnnotationDatum() ); // AnnotationDatum is a Comparator that orders variants by the value of the Annotation
data.put( annotation.getKey(), treeSet );
}
AnnotationDatum datum = new AnnotationDatum( value );
if( treeSet.contains(datum) ) { // contains() uses AnnotationDatum's equals function, so it only checks if the value field is already present
datum = treeSet.tailSet(datum).first();
} else {
treeSet.add(datum);
}
// NOTE(review): novelty heuristic — a variant is "novel" when it has no ID
// or its ID does not contain "rs"; confirm this matches the dbSNP convention
// in use.
final boolean isNovelVariant = !vc.hasID() || !vc.getID().contains("rs");
// Decide if the variant is a transition or transversion
if( VariantContextUtils.getSNPSubstitutionType(vc).compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0 ) {
datum.incrementTi( isNovelVariant, isInTruthSet, isTrueVariant );
} else {
datum.incrementTv( isNovelVariant, isInTruthSet, isTrueVariant );
}
}
}
// For each annotation: accumulate its ordered bins into chunks of at most
// MAX_VARIANTS_PER_BIN variants, write one table row per chunk and category
// (all / novel / dbsnp / truth), then invoke Rscript on the table.
public void plotCumulativeTables( final String PATH_TO_RSCRIPT, final String PATH_TO_RESOURCES, final String OUTPUT_PREFIX,
final int MIN_VARIANTS_PER_BIN, final int MAX_VARIANTS_PER_BIN ) {
final AnnotationDatum thisAnnotationBin = new AnnotationDatum();
System.out.println( "\nFinished reading variants into memory. Executing RScript commands:" );
// For each annotation we've seen
for( final String annotationKey : data.keySet() ) {
String filename = OUTPUT_PREFIX + annotationKey + ".dat";
PrintStream output;
try {
output = new PrintStream(filename); // Create the intermediate data file for this annotation
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotCreateOutputFile(new File(filename), "Can't create intermediate output annotation data file. Does the output directory exist?", e);
}
// Output a header line
output.println("value\ttitv\tdbsnp\ttruePositive\tnumVariants\tcategory");
// Bin SNPs and calculate truth metrics for each bin
thisAnnotationBin.clearBin();
for( final AnnotationDatum datum : data.get( annotationKey ) ) {
thisAnnotationBin.combine( datum );
if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) >= MAX_VARIANTS_PER_BIN ) { // This annotation bin is full
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + "\t" + thisAnnotationBin.calcTPrate() +
"\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.TRUTH_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.TRUTH_SET ) + "\ttruth");
thisAnnotationBin.clearBin();
}
// else, continue accumulating variants because this bin isn't full yet
}
// One final bin that may not have been dumped out
if( thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) != 0 ) {
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.FULL_SET ) + "\t" + thisAnnotationBin.calcDBsnpRate() + "\t" + thisAnnotationBin.calcTPrate() +
"\t" + thisAnnotationBin.numVariants( AnnotationDatum.FULL_SET ) + "\tall");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.NOVEL_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.NOVEL_SET ) + "\tnovel");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.DBSNP_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.DBSNP_SET ) + "\tdbsnp");
output.println( thisAnnotationBin.value + "\t" + thisAnnotationBin.calcTiTv( AnnotationDatum.TRUTH_SET ) + "\t-1\t-1\t" + thisAnnotationBin.numVariants( AnnotationDatum.TRUTH_SET ) + "\ttruth");
thisAnnotationBin.clearBin();
}
// Close the PrintStream
output.close();
String annotationName = null;
if( nameMap != null ) {
annotationName = nameMap.get(annotationKey);
}
if( annotationName == null ) { // This annotation is not in the name map so use the key instead
annotationName = annotationKey;
}
// Print out the command line to make it clear to the user what is being executed and how one might modify it
final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Annotations_BinnedTruthMetrics.R" + " " +
OUTPUT_PREFIX + annotationKey + ".dat" + " " + annotationName + " " + MIN_VARIANTS_PER_BIN + " " + INDICATE_MEAN_NUM_VARS;
System.out.println( rScriptCommandLine );
// Execute the RScript command to plot the table of truth values
// NOTE(review): exec(String) tokenizes on whitespace, so paths or labels
// containing spaces would break this command — consider exec(String[]).
try {
Runtime.getRuntime().exec( rScriptCommandLine );
} catch ( IOException e ) {
throw new UserException.CannotExecuteRScript( rScriptCommandLine, e );
}
}
}
}

View File

@ -1,170 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.analyzeannotations;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Comparator;
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Jan 18, 2010
*/
public class AnnotationDatum implements Comparator<AnnotationDatum> {

    /** The annotation value that labels this bin; bins are ordered by this value. */
    public float value;

    // Transition (ti) and transversion (tv) counts, indexed by the category constants below.
    private final int[] ti;
    private final int[] tv;

    public static final int FULL_SET = 0;      // every variant seen
    public static final int NOVEL_SET = 1;     // variants absent from dbSNP
    public static final int DBSNP_SET = 2;     // variants present in dbSNP
    public static final int TRUTH_SET = 3;     // variants overlapping the truth set
    public static final int TRUE_POSITIVE = 4; // truth-set variants marked as true
    private static final int NUM_SETS = 5;

    /** Creates a bin at value 0.0 with all counters zeroed (delegates to the main constructor). */
    public AnnotationDatum() {
        this( 0.0f );
    }

    /**
     * Creates a bin labeled with the given annotation value, all counters zeroed.
     * @param _value the annotation value for this bin
     */
    public AnnotationDatum( final float _value ) {
        value = _value;
        // Java zero-initializes int arrays, so the explicit clearing loops of the
        // original implementation were redundant and have been removed.
        ti = new int[NUM_SETS];
        tv = new int[NUM_SETS];
    }

    /**
     * Records one transition in every category the variant belongs to.
     * @param isNovelVariant true when the variant is absent from dbSNP
     * @param isInTruthSet   true when the variant overlaps the truth set
     * @param isTrueVariant  true when the truth set marks it as a real variant
     */
    final public void incrementTi( final boolean isNovelVariant, final boolean isInTruthSet, final boolean isTrueVariant ) {
        ti[FULL_SET]++;
        if( isNovelVariant ) {
            ti[NOVEL_SET]++;
        } else { // Is known, in DBsnp
            ti[DBSNP_SET]++;
        }
        if( isInTruthSet ) {
            ti[TRUTH_SET]++;
            if( isTrueVariant ) {
                ti[TRUE_POSITIVE]++;
            }
        }
    }

    /**
     * Records one transversion in every category the variant belongs to; mirrors incrementTi.
     * @param isNovelVariant true when the variant is absent from dbSNP
     * @param isInTruthSet   true when the variant overlaps the truth set
     * @param isTrueVariant  true when the truth set marks it as a real variant
     */
    final public void incrementTv( final boolean isNovelVariant, final boolean isInTruthSet, final boolean isTrueVariant ) {
        tv[FULL_SET]++;
        if( isNovelVariant ) {
            tv[NOVEL_SET]++;
        } else { // Is known, in DBsnp
            tv[DBSNP_SET]++;
        }
        if( isInTruthSet ) {
            tv[TRUTH_SET]++;
            if( isTrueVariant ) {
                tv[TRUE_POSITIVE]++;
            }
        }
    }

    /** Adds that bin's counts into this bin and adopts that bin's value label. */
    final public void combine( final AnnotationDatum that ) {
        for( int iii = 0; iii < NUM_SETS; iii++ ) {
            this.ti[iii] += that.ti[iii];
            this.tv[iii] += that.tv[iii];
        }
        this.value = that.value; // Overwrite this bin's value
    }

    /**
     * Computes the ti/tv ratio for the given category.
     * @param INDEX one of the *_SET constants
     * @return ti/tv, or 0 when no transversions were counted
     * @throws ReviewedStingException when a counter has wrapped negative (integer overflow)
     */
    final public float calcTiTv( final int INDEX ) {
        if( ti[INDEX] < 0 || tv[INDEX] < 0 ) {
            throw new ReviewedStingException( "Integer overflow detected! There are too many variants piled up in one annotation bin." );
        }
        if( tv[INDEX] == 0 ) { // Don't divide by zero
            return 0.0f;
        }
        return ((float) ti[INDEX]) / ((float) tv[INDEX]);
    }

    /** @return percentage (0-100) of all variants in this bin that are in dbSNP, or 0 for an empty bin */
    final public float calcDBsnpRate() {
        if( ti[FULL_SET] + tv[FULL_SET] == 0 ) { // Don't divide by zero
            return 0.0f;
        }
        return 100.0f * ((float) ti[DBSNP_SET] + tv[DBSNP_SET]) /
                ((float) ti[FULL_SET] + tv[FULL_SET]);
    }

    /** @return percentage (0-100) of truth-set variants that are true positives, or 0 when none */
    final public float calcTPrate() {
        if( ti[TRUTH_SET] + tv[TRUTH_SET] == 0 ) { // Don't divide by zero
            return 0.0f;
        }
        return 100.0f * ((float) ti[TRUE_POSITIVE] + tv[TRUE_POSITIVE]) /
                ((float) ti[TRUTH_SET] + tv[TRUTH_SET]);
    }

    /** @return total number of variants (ti + tv) counted in the given category */
    final public int numVariants( final int INDEX ) {
        return ti[INDEX] + tv[INDEX];
    }

    /** Resets the bin to value 0.0 with all counters zeroed. */
    final public void clearBin() {
        value = 0.0f;
        for( int iii = 0; iii < NUM_SETS; iii++ ) {
            ti[iii] = 0;
            tv[iii] = 0;
        }
    }

    // Comparator ordering by bin value.
    // NOTE(review): returns 0 when either value is NaN -- confirm NaN cannot occur here.
    public int compare( AnnotationDatum a1, AnnotationDatum a2 ) { // Function needed for this to be a Comparator
        if( a1.value < a2.value ) { return -1; }
        if( a1.value > a2.value ) { return 1; }
        return 0;
    }

    // NOTE(review): this does NOT override Object.equals (different signature and an
    // int return); it behaves like a compareTo. Kept as-is for interface compatibility.
    public int equals( AnnotationDatum that ) { // Function needed for this to be sorted correctly in a TreeSet
        if( this.value < that.value ) { return -1; }
        if( this.value > that.value ) { return 1; }
        return 0;
    }
}

View File

@ -42,5 +42,5 @@ public class LowMQ implements InfoFieldAnnotation {
public List<String> getKeyNames() { return Arrays.asList("LowMQ"); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 3, VCFHeaderLineType.Integer, "3-tuple: <fraction of reads with MQ=0>,<fraction of reads with MQ<=10>,<total nubmer of reads>")); }
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 3, VCFHeaderLineType.Float, "3-tuple: <fraction of reads with MQ=0>,<fraction of reads with MQ<=10>,<total number of reads>")); }
}

View File

@ -1011,7 +1011,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
sb.append((char)b);
cigar.add(new CigarElement(indelStr.length, CigarOperator.I));
} else {
throw new ReviewedStingException("Creating an alternate consensus from a complex indel is not allowed");
throw new IllegalStateException("Creating an alternate consensus from a complex indel is not allows");
}
if ( reference.length - refIdx > 0 )

View File

@ -0,0 +1,143 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.indels;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.commandline.Argument;
import java.io.File;
import java.util.*;
@By(DataSource.READS)
// walker to count realigned reads
//
// Scans reads against the interval list produced by RealignerTargetCreator and
// counts, per interval, the reads whose "OC" (original cigar) tag differs from
// their current cigar -- i.e. reads the indel realigner actually changed.
public class RealignedReadCounter extends ReadWalker<Integer, Integer> {

    public static final String ORIGINAL_CIGAR_TAG = "OC";     // tag holding the pre-realignment cigar
    public static final String ORIGINAL_POSITION_TAG = "OP";  // tag holding the pre-realignment position

    @Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true)
    protected String intervalsFile = null;

    // the intervals input by the user
    private Iterator<GenomeLoc> intervals = null;

    // the current interval in the list
    private GenomeLoc currentInterval = null;

    // running totals reported in onTraversalDone()
    private long updatedIntervals = 0, updatedReads = 0, affectedBases = 0;

    // true once the current interval has been counted, so it is counted at most once
    private boolean intervalWasUpdated = false;

    public void initialize() {
        // prepare to read intervals one-by-one, as needed (assuming they are sorted).
        intervals = new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new File(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY );
        currentInterval = intervals.hasNext() ? intervals.next() : null;
    }

    // Counts the read if it overlaps the current interval and carries an OC tag that
    // differs from its current cigar; otherwise advances the interval cursor past it.
    public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
        if ( currentInterval == null ) {
            return 0;
        }
        GenomeLoc readLoc = ref.getGenomeLocParser().createGenomeLoc(read);
        // hack to get around unmapped reads having screwy locations
        if ( readLoc.getStop() == 0 )
            readLoc = ref.getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart());
        if ( readLoc.isBefore(currentInterval) || ReadUtils.is454Read(read) )
            return 0;
        if ( readLoc.overlapsP(currentInterval) ) {
            if ( doNotTryToClean(read) )
                return 0;
            if ( read.getAttribute(ORIGINAL_CIGAR_TAG) != null ) {
                String newCigar = (String)read.getAttribute(ORIGINAL_CIGAR_TAG);
                // deal with an old bug
                if ( read.getCigar().toString().equals(newCigar) ) {
                    //System.out.println(currentInterval + ": " + read.getReadName() + " " + read.getCigarString() + " " + newCigar);
                    return 0;
                }
                // count the interval itself only once, on the first updated read
                if ( !intervalWasUpdated ) {
                    intervalWasUpdated = true;
                    updatedIntervals++;
                    // NOTE(review): 20 looks like a fixed padding around the event -- confirm
                    affectedBases += 20 + getIndelSize(read);
                }
                updatedReads++;
            }
        } else {
            // read is past the current interval: advance to the first interval not before it
            do {
                intervalWasUpdated = false;
                currentInterval = intervals.hasNext() ? intervals.next() : null;
            } while ( currentInterval != null && currentInterval.isBefore(readLoc) );
        }
        return 0;
    }

    // Returns the length of the first deletion in the read's cigar, 0 otherwise.
    // NOTE(review): an insertion returns 0 (insertions consume no reference bases) --
    // confirm this is intentional and not a missing ce.getLength().
    private int getIndelSize(SAMRecord read) {
        for ( CigarElement ce : read.getCigar().getCigarElements() ) {
            if ( ce.getOperator() == CigarOperator.I )
                return 0;
            if ( ce.getOperator() == CigarOperator.D )
                return ce.getLength();
        }
        logger.warn("We didn't see an indel for this read: " + read.getReadName() + " " + read.getAlignmentStart() + " " + read.getCigar());
        return 0;
    }

    // Mirrors the realigner's eligibility filter: reads matching these conditions
    // were never candidates for cleaning, so they must not be counted.
    private boolean doNotTryToClean(SAMRecord read) {
        return read.getReadUnmappedFlag() ||
                read.getNotPrimaryAlignmentFlag() ||
                read.getReadFailsVendorQualityCheckFlag() ||
                read.getMappingQuality() == 0 ||
                read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ||
                (BadMateFilter.hasBadMate(read));
    }

    public Integer reduceInit() {
        return 0;
    }

    public Integer reduce(Integer value, Integer sum) {
        return sum + value;
    }

    // All counting happens in the fields above; just print the totals.
    public void onTraversalDone(Integer result) {
        System.out.println(updatedIntervals + " intervals were updated");
        System.out.println(updatedReads + " reads were updated");
        System.out.println(affectedBases + " bases were affected");
    }
}

View File

@ -0,0 +1,62 @@
package org.broadinstitute.sting.gatk.walkers.qc;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.List;
import java.io.PrintStream;
/**
* Counts the number of contiguous regions the walker traverses over. Slower than it needs to be, but
* very useful since overlapping intervals get merged, so you can count the number of intervals the GATK merges down to.
* This was its very first use.
*/
public class CountIntervals extends RefWalker<Long, Long> {
    @Output
    PrintStream out;

    @Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false)
    int numOverlaps = 2;

    /** Initial per-interval accumulator: no features seen yet. */
    public Long reduceInit() {
        return 0L;
    }

    /** Accumulate separately for each (merged) interval so we get one count per interval. */
    public boolean isReduceByInterval() { return true; }

    /**
     * @return the number of "check" features overlapping this locus, or null when
     *         there is no tracker (reduce ignores null results)
     */
    public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( tracker == null ) {
            return null;
        }
        List<GATKFeature> checkIntervals = tracker.getGATKFeatureMetaData("check",false);
        return (long) checkIntervals.size();
    }

    /**
     * Tracks the maximum overlap depth seen within the current interval.
     * BUGFIX: a null map result (locus without a tracker) previously reset the
     * accumulator to 0, discarding the running maximum; it now preserves prev.
     */
    public Long reduce(Long loc, Long prev) {
        if ( loc == null ) {
            return prev;
        } else {
            return Math.max(prev, loc);
        }
    }

    /** Count how many intervals reached at least numOverlaps overlapping features. */
    public void onTraversalDone(List<Pair<GenomeLoc,Long>> finalReduce) {
        long count = 0;
        for ( Pair<GenomeLoc,Long> g : finalReduce ) {
            if ( g.second >= numOverlaps ) {
                count++;
            }
        }
        // %n added so the final line is newline-terminated
        out.printf("Number of contiguous intervals: %d%n", count);
    }
}

View File

@ -1,157 +0,0 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.qc;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
import org.broad.tribble.readers.AsciiLineReader;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.SimpleTimer;
import java.io.PrintStream;
import java.io.File;
import java.io.FileInputStream;
import java.util.*;
/**
* Emits specific fields as dictated by the user from one or more VCF files.
*/
@Requires(value={})
public class ProfileRodSystem extends RodWalker<Integer, Integer> {
    @Output(doc="File to which results should be written",required=true)
    protected PrintStream out;

    @Argument(fullName="nIterations", shortName="N", doc="Number of raw reading iterations to perform", required=false)
    int nIterations = 1;

    @Argument(fullName="maxRecords", shortName="M", doc="Max. number of records to process", required=false)
    int MAX_RECORDS = -1;

    SimpleTimer timer = new SimpleTimer("myTimer");

    /**
     * Benchmarks progressively richer ways of reading the first ROD file
     * (raw bytes, lines, split parts, loc-only decode, full decode), then
     * starts the timer that measures the GATK traversal itself.
     */
    public void initialize() {
        File rodFile = getRodFile();

        out.printf("# walltime is in seconds%n");
        out.printf("# file is %s%n", rodFile);
        out.printf("# file size is %d bytes%n", rodFile.length());
        out.printf("operation\titeration\twalltime%n");

        for ( int i = 0; i < nIterations; i++ ) {
            out.printf("read.bytes\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_BYTE));
            out.printf("read.line\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_LINE));
            out.printf("line.and.parts\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_PARTS));
            out.printf("decode.loc\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE_LOC));
            out.printf("full.decode\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE));
        }

        timer.start(); // start up timer for map itself
    }

    private enum ReadMode { BY_BYTE, BY_LINE, BY_PARTS, DECODE_LOC, DECODE };

    /**
     * Reads the whole file once in the requested mode (bounded by MAX_RECORDS
     * unless it is -1) and returns the elapsed wall time in seconds.
     * BUGFIX: the FileInputStream was never closed, leaking one file descriptor
     * per benchmark pass; it is now closed in a finally block.
     */
    private final double readFile(File f, ReadMode mode) {
        timer.start();

        FileInputStream s = null;
        try {
            byte[] data = new byte[100000];
            s = new FileInputStream(f);

            if ( mode == ReadMode.BY_BYTE ) {
                while (true) {
                    if ( s.read(data) == -1 )
                        break;
                }
            } else {
                int counter = 0;
                VCFCodec codec = new VCFCodec();
                String[] parts = new String[100000];
                AsciiLineReader lineReader = new AsciiLineReader(s);
                if ( mode == ReadMode.DECODE_LOC || mode == ReadMode.DECODE )
                    codec.readHeader(lineReader);
                while (counter++ < MAX_RECORDS || MAX_RECORDS == -1) {
                    String line = lineReader.readLine();
                    if ( line == null )
                        break;
                    else if ( mode == ReadMode.BY_PARTS ) {
                        ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR);
                    }
                    else if ( mode == ReadMode.DECODE_LOC ) {
                        codec.decodeLoc(line);
                    }
                    else if ( mode == ReadMode.DECODE ) {
                        processOneVC((VariantContext)codec.decode(line));
                    }
                }
            }
        } catch ( Exception e ) {
            throw new RuntimeException(e);
        } finally {
            if ( s != null ) {
                try { s.close(); } catch ( java.io.IOException e ) { /* best-effort close */ }
            }
        }

        return timer.getElapsedTime();
    }

    /** @return the first ROD file the engine loaded (the one we benchmark) */
    private File getRodFile() {
        List<ReferenceOrderedDataSource> rods = this.getToolkit().getRodDataSources();
        ReferenceOrderedDataSource rod = rods.get(0);
        return rod.getFile();
    }

    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( tracker == null ) // RodWalkers can make funky map calls
            return 0;
        VariantContext vc = tracker.getVariantContext(ref, "rod", context.getLocation());
        processOneVC(vc);
        return 0;
    }

    // Forces lazy parsing of the sample/genotype data so "full decode" really decodes.
    private static final void processOneVC(VariantContext vc) {
        vc.getNSamples(); // force us to parse the samples
    }

    public Integer reduceInit() {
        return 0;
    }

    public Integer reduce(Integer counter, Integer sum) {
        return counter + sum;
    }

    /** Report the wall time of the GATK traversal (timer started at end of initialize). */
    public void onTraversalDone(Integer sum) {
        out.printf("gatk.traversal\t%d\t%.2f%n", 0, timer.getElapsedTime());
    }
}

View File

@ -0,0 +1,153 @@
package org.broadinstitute.sting.gatk.walkers.qc;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.*;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Collection;
import java.util.List;
/**
* a walker for validating (in the style of validating pile-up) the ROD system.
*/
@Reference(window=@Window(start=-40,stop=40))
public class RodSystemValidationWalker extends RodWalker<Integer,Integer> {

    // the divider to use in some of the text output
    private static final String DIVIDER = ",";

    @Output
    public PrintStream out;

    @Argument(fullName="PerLocusEqual",required=false,doc="Should we check that all records at the same site produce equivilent variant contexts")
    public boolean allRecordsVariantContextEquivalent = false;

    // used to calculate the MD5 of a file
    MessageDigest digest = null;

    // we sometimes need to know what rods the engine's seen
    List<ReferenceOrderedDataSource> rodList;

    /**
     * emit the md5 sums for each of the input ROD files (will save up a lot of time if and when the ROD files change
     * underneath us).
     */
    public void initialize() {
        // setup the MD5-er
        try {
            digest = MessageDigest.getInstance("MD5");
        } catch (NoSuchAlgorithmException e) {
            throw new ReviewedStingException("Unable to find MD5 checksumer");
        }
        out.println("Header:");
        // enumerate the list of ROD's we've loaded
        rodList = this.getToolkit().getRodDataSources();
        for (ReferenceOrderedDataSource rod : rodList) {
            out.println(rod.getName() + DIVIDER + rod.getType());
            out.println(rod.getName() + DIVIDER + rod.getFile());
            // MessageDigest.digest() resets the digest, so one instance is safe across files
            out.println(rod.getName() + DIVIDER + md5sum(rod.getFile()));
        }
        out.println("Data:");
    }

    /**
     *
     * @param tracker the ref meta data tracker to get RODs
     * @param ref reference context
     * @param context the reads
     * @return an 1 for each site with a rod(s), 0 otherwise
     */
    @Override
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        int ret = 0;
        if (tracker != null && tracker.getAllRods().size() > 0) {
            out.print(context.getLocation() + DIVIDER);
            Collection<GATKFeature> features = tracker.getAllRods();
            for (GATKFeature feat : features)
                out.print(feat.getName() + DIVIDER);
            out.println(";");
            ret++;
        }
        // if the argument was set, check for equivalence
        if (allRecordsVariantContextEquivalent && tracker != null) {
            Collection<VariantContext> col = tracker.getAllVariantContexts(ref);
            VariantContext con = null;
            // BUGFIX: the original compared the first context against the whole
            // collection (con.equals(col)), which can never be true, so every site
            // with more than one record was reported as a FAIL. Compare each
            // context against the first one instead.
            for (VariantContext contextInList : col)
                if (con == null) con = contextInList;
                else if (!con.equals(contextInList)) out.println("FAIL: context " + contextInList + " doesn't match " + con);
        }
        return ret;
    }

    /**
     * Provide an initial value for reduce computations.
     *
     * @return Initial value of reduce.
     */
    @Override
    public Integer reduceInit() {
        return 0;
    }

    /**
     * Reduces a single map with the accumulator provided as the ReduceType.
     *
     * @param value result of the map.
     * @param sum accumulator for the reduce.
     * @return accumulator with result of the map taken into account.
     */
    @Override
    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    @Override
    public void onTraversalDone(Integer result) {
        // Double check traversal result to make count is the same.
        // TODO: Is this check necessary?
        out.println("[REDUCE RESULT] Traversal result is: " + result);
    }

    // shamelessly absconded and adapted from http://www.javalobby.org/java/forums/t84420.html
    // Streams the file through the shared digest and returns the hex MD5 string.
    private String md5sum(File f) {
        InputStream is;
        try {
            is = new FileInputStream(f);
        } catch (FileNotFoundException e) {
            return "Not a file";
        }
        byte[] buffer = new byte[8192];
        int read = 0;
        try {
            while ((read = is.read(buffer)) > 0) {
                digest.update(buffer, 0, read);
            }
            byte[] md5sum = digest.digest();
            BigInteger bigInt = new BigInteger(1, md5sum);
            return bigInt.toString(16);
        }
        catch (IOException e) {
            throw new RuntimeException("Unable to process file for MD5", e);
        }
        finally {
            try {
                is.close();
            }
            catch (IOException e) {
                throw new RuntimeException("Unable to close input stream for MD5 calculation", e);
            }
        }
    }
}

View File

@ -1,298 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.qc;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.Argument;
import java.io.PrintStream;
/**
* Walks over the input data set, calculating the number of reads seen for diagnostic purposes.
* Can also count the number of reads matching a given criterion using read filters (see the
* --read-filter command line argument). Simplest example of a read-backed analysis.
*/
@BAQMode(QualityMode = BAQ.QualityMode.DONT_MODIFY, ApplicationTime = BAQ.ApplicationTime.HANDLED_IN_WALKER)
@Reference(window=@Window(start=-5,stop=5))
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
public class ValidateBAQWalker extends ReadWalker<Integer, Integer> {
    @Output(doc="File to which results should be written",required=true)
    protected PrintStream out;

    @Argument(doc="maximum read length to apply the BAQ calculation too",required=false)
    protected int maxReadLen = 1000;

    @Argument(doc="",required=false)
    protected int bw = 7;

    @Argument(doc="",required=false)
    protected boolean samtoolsMode = false;

    @Argument(doc="only operates on reads with this name",required=false)
    protected String readName = null;

    @Argument(doc="If true, all differences are errors", required=false)
    protected boolean strict = false;

    @Argument(doc="prints info for each read", required=false)
    protected boolean printEachRead = false;

    @Argument(doc="Also prints out detailed comparison information when for known calculation differences", required=false)
    protected boolean alsoPrintWarnings = false;

    @Argument(doc="Include reads without BAQ tag", required=false)
    protected boolean includeReadsWithoutBAQTag = false;

    @Argument(doc="x each read is processed", required=false)
    protected int magnification = 1;

    @Argument(doc="Profile performance", required=false)
    protected boolean profile = false;

    // number of reads checked so far; drives the periodic progress line
    int counter = 0;

    BAQ baqHMM = null; // matches current samtools parameters

    /** Configure the HMM with samtools-compatible parameters or GATK defaults. */
    public void initialize() {
        if ( samtoolsMode )
            baqHMM = new BAQ(1e-3, 0.1, bw, (byte)0, true);
        else
            baqHMM = new BAQ();
    }

    long goodReads = 0, badReads = 0;

    /**
     * Per-read entry point: either profiles or validates the BAQ calculation for
     * reads that pass the name/length/tag filters.
     * @return 1 if the read was processed, 0 if it was skipped
     */
    public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) {
        if ( (readName == null || readName.equals(read.getReadName())) && read.getReadLength() <= maxReadLen && (includeReadsWithoutBAQTag || BAQ.hasBAQTag(read) ) ) {
            if ( baqHMM.excludeReadFromBAQ(read) )
                return 0;
            if ( profile ) {
                profileBAQ(ref, read);
            } else {
                validateBAQ(ref, read);
            }
            return 1;
        }
        return 0;
    }

    SimpleTimer tagTimer = new SimpleTimer("from.tag");
    SimpleTimer baqReadTimer = new SimpleTimer("baq.read");
    SimpleTimer glocalTimer = new SimpleTimer("hmm.glocal");

    /**
     * Times the tag-based and HMM-based BAQ calculations, running each
     * 'magnification' times so very short per-read costs become measurable.
     * FIX: removed an unused local (BAQ.BAQCalculationResult baq) that was never read.
     * NOTE(review): the baq.read and hmm.glocal sections time the identical
     * baqRead() call -- confirm whether the glocal section was meant to time
     * the raw glocal HMM instead.
     */
    private void profileBAQ(ReferenceContext ref, SAMRecord read) {
        IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference();

        tagTimer.restart();
        for ( int i = 0; i < magnification; i++ ) { BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag); }
        tagTimer.stop();

        baqReadTimer.restart();
        for ( int i = 0; i < magnification; i++ ) { baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY ); }
        baqReadTimer.stop();

        glocalTimer.restart();
        for ( int i = 0; i < magnification; i++ )
            baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY);
        glocalTimer.stop();
    }

    /**
     * Recomputes BAQ with the HMM and compares it byte-by-byte against the BAQ
     * encoded in the read's tag. Known benign causes of disagreement
     * (cigar/read length mismatch, Q0 bases, soft clips, deletions) are logged
     * and only fail in -strict mode; any other difference throws.
     */
    private void validateBAQ(ReferenceContext ref, SAMRecord read) {
        IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference();
        byte[] baqFromTag = BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag);

        // periodic progress line (every 1000 reads) unless we print every read
        if (counter++ % 1000 == 0 || printEachRead) out.printf("Checking read %s (%d)%n", read.getReadName(), counter);

        BAQ.BAQCalculationResult baq = baqHMM.calcBAQFromHMM(read, refReader);

        boolean fail = false;   // hard failure -> exception at the end
        boolean print = false;  // known-benign difference -> print/warn only
        int badi = 0;           // index of the first differing base
        if ( BAQ.hasBAQTag(read) ) {
            for ( badi = 0; badi < baqFromTag.length; badi++ ) {
                if ( baqFromTag[badi] != baq.bq[badi] ) {
                    // classify the first difference; each known cause breaks out of the loop
                    if ( cigarLength(read) != read.getReadLength() ) {
                        print = true;
                        fail = false;
                        out.printf("  different, but cigar length != read length%n");
                        break;
                    }
                    if (MathUtils.arrayMin(read.getBaseQualities()) == 0) {
                        print = true;
                        fail = strict;
                        out.printf("  different, but Q0 base detected%n");
                        break;
                    }
                    else if (readHasSoftClip(read) && ! samtoolsMode) {
                        print = true;
                        fail = strict;
                        out.printf("  different, but soft clip detected%n");
                        break;
                    } else if (readHasDeletion(read) ) { // && ! samtoolsMode) {
                        print = true;
                        fail = strict;
                        out.printf("  different, but deletion detected%n");
                        break;
                    } else if ( baq.bq[badi] < baqHMM.getMinBaseQual() ) {
                        print = fail = true;
                        out.printf("  Base quality %d < min %d", baq.bq[badi], baqHMM.getMinBaseQual());
                        break;
                    } else {
                        // unexplained difference: always an error
                        print = fail = true;
                        break;
                    }
                }
            }

            if ( fail || print )
                badReads++;
            else
                goodReads++;
        }

        if ( fail || printEachRead || ( print && alsoPrintWarnings ) ) {
            // dump everything we know about the read for debugging
            byte[] pos = new byte[baq.bq.length];
            for ( int i = 0; i < pos.length; i++ ) pos[i] = (byte)i;

            out.printf("  read length   : %d%n", read.getReadLength());
            out.printf("  read start    : %d (%d unclipped)%n", read.getAlignmentStart(), read.getUnclippedStart());
            out.printf("  cigar         : %s%n", read.getCigarString());
            out.printf("  ref bases     : %s%n", new String(baq.refBases));
            out.printf("  read bases    : %s%n", new String(read.getReadBases()));
            out.printf("  ref length    : %d%n", baq.refBases.length);
            out.printf("  BQ tag        : %s%n", read.getStringAttribute(BAQ.BAQ_TAG));
            if ( BAQ.hasBAQTag(read) ) printQuals("  BQ deltas     : ", getBAQDeltas(read), true);
            printQuals("  original quals: ", read.getBaseQualities(), true);
            printQuals("       baq quals: ", baq.bq, true);
            printQuals("  positions     : ", pos, true);
            printQuals("  original quals: ", read.getBaseQualities());
            if ( BAQ.hasBAQTag(read) ) printQuals("       tag quals: ", baqFromTag);
            printQuals("       hmm quals: ", baq.bq);
            out.printf("  read bases    : %s%n", new String(read.getReadBases()));
            out.println(Utils.dupString('-', 80));
        }

        if ( fail )
            throw new StingException(String.format("BAQ from read and from HMM differ in read %s at position %d: tag qual = %d, hmm qual = %d",
                    read.getReadName(), badi, baqFromTag[badi], baq.bq[badi]));
    }

    /** @return true when the cigar contains a soft clip */
    private final static boolean readHasSoftClip(SAMRecord read) {
        for (CigarElement e : read.getCigar().getCigarElements()) {
            if ( e.getOperator() == CigarOperator.SOFT_CLIP )
                return true;
        }
        return false;
    }

    /** @return true when the cigar contains a deletion */
    private final static boolean readHasDeletion(SAMRecord read) {
        for (CigarElement e : read.getCigar().getCigarElements()) {
            if ( e.getOperator() == CigarOperator.DELETION )
                return true;
        }
        return false;
    }

    public final void printQuals( String prefix, byte[] quals ) {
        printQuals(prefix, quals, false);
    }

    public final void printQuals( String prefix, byte[] quals, boolean asInt ) {
        printQuals(out, prefix, quals, asInt);
    }

    /** Print quals either as comma-separated ints or as FASTQ-style (qual+33) characters. */
    public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) {
        out.print(prefix);
        for ( int i = 0; i < quals.length; i++) {
            if ( asInt ) {
                out.printf("%2d", (int)quals[i]);
                if ( i+1 != quals.length ) out.print(",");
            } else
                out.print((char)(quals[i]+33));
        }
        out.println();
    }

    /**
     * Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present.
     * @param read the read carrying the BAQ tag
     * @return the per-base deltas (negated, offset by 64), or null when the tag is absent
     */
    public static byte[] getBAQDeltas(SAMRecord read) {
        byte[] baq = BAQ.getBAQTag(read);
        if ( baq != null ) {
            byte[] deltas = new byte[baq.length];
            for ( int i = 0; i < deltas.length; i++)
                deltas[i] = (byte)(-1 * (baq[i] - 64));
            return deltas;
        } else
            return null;
    }

    /**
     * Sums the read-consuming cigar lengths (M/I/S; D/H/P consume none here).
     * @return the read length implied by the cigar, or 0 when the cigar has an N
     *         (which this walker cannot handle)
     */
    private int cigarLength(SAMRecord read) {
        int readI = 0;
        for ( CigarElement elt : read.getCigar().getCigarElements() ) {
            int l = elt.getLength();
            switch (elt.getOperator()) {
                case N: // cannot handle these
                    return 0;
                case H : case P : // ignore pads and hard clips
                    break;
                case S :
                case I :
                    readI += l;
                    break;
                case D : break;
                case M :
                    readI += l;
                    break;
                default:
                    throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName());
            }
        }
        return readI;
    }

    public Integer reduceInit() { return 0; }

    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    /** Report either the profiling timers or the concordant/discordant read summary. */
    public void onTraversalDone(Integer nreads) {
        if ( profile ) {
            out.printf("n.reads baq.per.read calculation time.in.secs%n");
            printTimer(nreads, tagTimer);
            printTimer(nreads, glocalTimer);
            printTimer(nreads, baqReadTimer);
        } else {
            out.printf("total reads BAQ'd %d; concordant BAQ reads %d %.4f; discordant BAQ reads %d %.4f%n", nreads,
                    goodReads, (100.0 * goodReads) / nreads,
                    badReads, (100.0 * badReads) / nreads);
        }
    }

    private final void printTimer(int nreads, SimpleTimer timer) {
        out.printf("%d %d %s %.2f%n", nreads, magnification, timer.getName(), timer.getElapsedTime());
    }
}

View File

@ -55,7 +55,6 @@ import java.util.*;
public class ApplyRecalibration extends RodWalker<Integer, Integer> {
/////////////////////////////
// Inputs
/////////////////////////////
@ -86,6 +85,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
final private List<Tranche> tranches = new ArrayList<Tranche>();
final private Set<String> inputNames = new HashSet<String>();
final private NestedHashMap lodMap = new NestedHashMap();
final private NestedHashMap annotationMap = new NestedHashMap();
final private Set<String> ignoreInputFilterSet = new TreeSet<String>();
//---------------------------------------------------------------------------------------------------------------
@ -124,6 +124,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model"));
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out"));
final TreeSet<String> samples = new TreeSet<String>();
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));
@ -149,6 +150,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
for ( final String line : new XReadLines( RECAL_FILE ) ) {
final String[] vals = line.split(",");
lodMap.put( Double.parseDouble(vals[3]), vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys
annotationMap.put( vals[4], vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(RECAL_FILE, e);
@ -174,11 +176,15 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
String filterString = null;
final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
final Double lod = (Double) lodMap.get( vc.getChr(), vc.getStart(), vc.getEnd() );
final String worstAnnotation = (String) annotationMap.get( vc.getChr(), vc.getStart(), vc.getEnd() );
if( lod == null ) {
throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc );
}
// Annotate the new record with its VQSLOD and the worst performing annotation
attrs.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod));
attrs.put(VariantRecalibrator.CULPRIT_KEY, worstAnnotation);
for( int i = tranches.size() - 1; i >= 0; i-- ) {
final Tranche tranche = tranches.get(i);
if( lod >= tranche.minVQSLod ) {

View File

@ -190,6 +190,19 @@ public class GaussianMixtureModel {
return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
}
/**
 * Scores a single annotation dimension of the datum against this mixture model.
 *
 * Each Gaussian contributes its mixture weight (log10) plus the log10 density of
 * a univariate normal parameterized by that Gaussian's mean and variance along
 * dimension iii; the contributions are log10-summed.
 *
 * @param datum variant whose annotation value is evaluated
 * @param iii   index of the annotation dimension to evaluate
 * @return log10( Sum_k pi_k * p(v | dimension iii, gaussian k) ), or null when
 *         the datum has no value in this dimension
 */
public Double evaluateDatumInOneDimension( final VariantDatum datum, final int iii ) {
    // A missing annotation cannot be scored in this dimension.
    if( datum.isNull[iii] ) {
        return null;
    }

    // Reusable univariate normal; its state is overwritten per Gaussian below.
    final Normal univariateNormal = new Normal(0.0, 1.0, null);
    final double[] perGaussianLog10 = new double[gaussians.size()];
    int kkk = 0;
    for( final MultivariateGaussian mvg : gaussians ) {
        univariateNormal.setState( mvg.mu[iii], mvg.sigma.get(iii, iii) );
        perGaussianLog10[kkk] = mvg.pMixtureLog10 + Math.log10( univariateNormal.pdf( datum.annotations[iii] ) );
        kkk++;
    }

    // Sum(pi_k * p(v|n,k)) carried out in log10 space.
    return MathUtils.log10sumLog10(perGaussianLog10);
}
public double evaluateDatumMarginalized( final VariantDatum datum ) {
int numVals = 0;
double sumPVarInGaussian = 0.0;

View File

@ -254,7 +254,9 @@ public class VariantDataManager {
/**
 * Writes the recalibration table: exactly one CSV row per variant datum.
 *
 * Row format is "contig,start,stop,lod,culprit", where culprit is the name of
 * the worst-performing annotation, or the literal string "NULL" when none was
 * determined (worstAnnotation == -1). ApplyRecalibration splits each row on
 * commas and reads all five fields, so no other row shape may be emitted.
 *
 * @param RECAL_FILE destination stream for the recalibration table
 */
public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) {
    for( final VariantDatum datum : data ) {
        // Single 5-column row per datum. (A duplicate legacy 4-column row here
        // would break the downstream parser, which expects five fields.)
        RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s",
                datum.contig, datum.start, datum.stop, datum.lod,
                (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")));
    }
}
}

View File

@ -24,6 +24,7 @@ public class VariantDatum implements Comparable<VariantDatum> {
public String contig;
public int start;
public int stop;
public int worstAnnotation;
public MultivariateGaussian assignment; // used in K-means implementation
public int compareTo( final VariantDatum other ) {

View File

@ -60,6 +60,7 @@ import java.util.*;
public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> implements TreeReducible<ExpandingArrayList<VariantDatum>> {
public static final String VQS_LOD_KEY = "VQSLOD";
public static final String CULPRIT_KEY = "culprit";
@ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection();
@ -232,6 +233,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
engine.evaluateData( dataManager.getData(), goodModel, false );
final GaussianMixtureModel badModel = engine.generateModel( dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS ) );
engine.evaluateData( dataManager.getData(), badModel, true );
engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel );
final ExpandingArrayList<VariantDatum> randomData = dataManager.getRandomDataForPlotting( 6000 );

View File

@ -57,6 +57,23 @@ public class VariantRecalibratorEngine {
}
}
/**
 * For every datum, records the annotation dimension that discriminates worst
 * between the good and bad models: the dimension with the smallest difference
 * (good log10 likelihood minus bad log10 likelihood). Dimensions that cannot be
 * evaluated in either model are skipped. Stores the winning index (or -1 when
 * no dimension could be evaluated) into datum.worstAnnotation.
 *
 * @param data      variants to annotate with their culprit dimension
 * @param goodModel mixture model trained on good variants
 * @param badModel  mixture model trained on bad variants
 */
public void calculateWorstPerformingAnnotation( final List<VariantDatum> data, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel ) {
    for( final VariantDatum datum : data ) {
        int culpritIndex = -1;
        double smallestDelta = Double.MAX_VALUE;
        for( int dim = 0; dim < datum.annotations.length; dim++ ) {
            final Double goodLog10 = goodModel.evaluateDatumInOneDimension(datum, dim);
            final Double badLog10 = badModel.evaluateDatumInOneDimension(datum, dim);
            if( goodLog10 == null || badLog10 == null ) {
                // Dimension is missing for this datum in one of the models.
                continue;
            }
            final double delta = goodLog10 - badLog10;
            if( delta < smallestDelta ) {
                smallestDelta = delta;
                culpritIndex = dim;
            }
        }
        datum.worstAnnotation = culpritIndex;
    }
}
/////////////////////////////
// Private Methods used for generating a GaussianMixtureModel
/////////////////////////////

View File

@ -22,7 +22,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.datasources.pipeline;
package org.broadinstitute.sting.pipeline;
import java.util.ArrayList;
import java.util.List;

View File

@ -22,7 +22,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.datasources.pipeline;
package org.broadinstitute.sting.pipeline;
import java.io.File;
import java.util.Map;

View File

@ -22,7 +22,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.datasources.pipeline;
package org.broadinstitute.sting.pipeline;
import java.io.File;
import java.util.Map;

View File

@ -1,149 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.broad;
import org.apache.commons.io.filefilter.RegexFileFilter;
import java.io.File;
import java.io.FileFilter;
import java.io.FileNotFoundException;
import java.util.Arrays;
/**
 * Static helpers for locating Picard aggregation output (sample BAMs and their
 * versioned "vN" directories) under the Broad's /seq/picard_aggregation tree.
 * A version is considered usable only when its directory contains finished.txt.
 */
public class PicardAggregationUtils {
public static final String PICARD_AGGREGATION_DIR = "/seq/picard_aggregation/";
/**
* Returns the path to the sample BAM for an explicit version (no existence check).
* @param project Project
* @param sample Sample
* @param version Version
* @return The path to the sample BAM.
*/
public static String getSampleBam(String project, String sample, int version) {
return getSampleDir(project, sample, version) + sample + ".bam";
}
/**
* Returns the path to the BAM in the latest finished version directory.
* @param project Project
* @param sample Sample
* @return The path to the latest BAM.
* @throws FileNotFoundException If a finished directory cannot be found for a sample.
*/
public static String getSampleBam(String project, String sample) throws FileNotFoundException {
return getSampleDir(project, sample) + sample + ".bam";
}
/**
* Returns the sample directory for an explicit version (pure string building,
* no existence check).
* @param project Project
* @param sample Sample
* @param version Version
* @return the sample directory.
*/
public static String getSampleDir(String project, String sample, int version) {
return PICARD_AGGREGATION_DIR + String.format("%s/%s/v%d/", project, sample, version);
}
/**
* Returns the latest finished directory for this project sample.
* @param project Project
* @param sample Sample
* @return The path to the latest finished directory.
* @throws FileNotFoundException If a finished directory cannot be found for a sample.
*/
public static String getSampleDir(String project, String sample) throws FileNotFoundException {
int latestVersion = getLatestVersion(project, sample);
// getLatestVersion(project, sample) starts from 0, so 0 here means no
// finished version directory was found at all.
if (latestVersion == 0)
throw new FileNotFoundException("Unable to find a finished directory for project sample " + project + "/" + sample);
return getSampleDir(project, sample, latestVersion);
}
/**
* Returns the latest finished version directory.
* Because isilon metadata operations are relatively slow this method
* tries not to look for every possible versioned directory.
* @param project Project
* @param sample Sample
* @return The highest finished version directory or 0 if a finished directory was not found.
*/
public static int getLatestVersion(String project, String sample) {
return getLatestVersion(project, sample, 0);
}
/**
* Returns the latest finished version directory at or after startVersion.
* Because isilon metadata operations are relatively slow this method
* tries not to look for every possible versioned directory: it probes ahead
* up to three versions at a time, and only falls back to listing the whole
* sample directory when nothing was found starting from version 0.
* @param project Project
* @param sample Sample
* @param startVersion minimum version to return
* @return The highest finished version directory after startVersion
*/
public static int getLatestVersion(String project, String sample, int startVersion) {
int version = Math.max(0, startVersion);
boolean nextExists = true;
// Skip-ahead probe: from the current version, test +3, +2, +1 in that order
// and jump to the highest finished one; stop when none of the next three
// versions is finished. This touches far fewer directories than testing
// every version when versions are dense.
while (nextExists) {
nextExists = false;
for (int next = 3; next > 0; next--)
if (isFinished(project, sample, version + next)) {
version += next;
nextExists = true;
break;
}
}
// Special case when the version is 0: the probe found nothing, which can
// happen when version numbering has gaps larger than 3 near the start.
// Because isilon storage takes a while to do meta data operations only look
// through every file if we have to: list all "vN" entries, sort them, and
// take the highest finished one.
if (version == 0) {
File sampleDir = new File(PICARD_AGGREGATION_DIR + project + "/" + sample);
if (sampleDir.exists()) {
FileFilter filter = new RegexFileFilter("v\\d+");
File[] files = sampleDir.listFiles(filter);
int[] versions = new int[files.length];
for (int i = 0; i < files.length; i++)
versions[i] = Integer.parseInt(files[i].getName().substring(1));
Arrays.sort(versions);
for (int i = versions.length - 1; i >= 0; i--) {
if (isFinished(project, sample, versions[i])) {
version = versions[i];
break;
}
}
}
}
// If still nothing was found, fall back to the caller's startVersion
// (which may be negative; see callers that pass -1).
return version == 0 ? startVersion : version;
}
/**
* Returns true if the project sample directory contains a finished.txt
* @param project Project
* @param sample Sample
* @param version Version
* @return true if the project sample directory contains a finished.txt
*/
public static boolean isFinished(String project, String sample, int version) {
return new File(getSampleDir(project, sample, version), "finished.txt").exists();
}
}

View File

@ -1,108 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.broad;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
 * Parses a Picard aggregation analysis_files.txt table and exposes the
 * reference sequence, target intervals, and bait intervals recorded in it.
 *
 * The file is tab-separated: lines starting with '#' are comments, the first
 * non-comment line is the header row, and every subsequent line is a data row.
 * Each tracked column may hold at most one distinct value across all rows.
 */
public class PicardAnalysisFiles {
    private static final String REFERENCE_SEQUENCE_HEADER = "REFERENCE_SEQUENCE";
    private static final String TARGET_INTERVALS_HEADER = "TARGET_INTERVALS";
    private static final String BAIT_INTERVALS_HEADER = "BAIT_INTERVALS";
    private static final String[] ANALYSIS_HEADERS = {REFERENCE_SEQUENCE_HEADER, TARGET_INTERVALS_HEADER, BAIT_INTERVALS_HEADER};
    private static final String ANALYSIS_FILES = "analysis_files.txt";

    // Path the table was read from; kept for getPath() and error messages.
    private String path;
    // For each tracked header, the set of distinct values seen in data rows.
    private Map<String,Set<String>> headerValues = new HashMap<String,Set<String>>();

    /**
     * Parses the analysis files table for the latest finished version of the sample.
     * @param project Project
     * @param sample Sample
     * @throws FileNotFoundException if the sample directory or table cannot be found.
     */
    public PicardAnalysisFiles(String project, String sample) throws FileNotFoundException {
        this(PicardAggregationUtils.getSampleDir(project, sample) + ANALYSIS_FILES);
    }

    /**
     * Parses the analysis files table for an explicit version of the sample.
     * @param project Project
     * @param sample Sample
     * @param version Version
     * @throws FileNotFoundException if the table cannot be found.
     */
    public PicardAnalysisFiles(String project, String sample, int version) throws FileNotFoundException {
        this(PicardAggregationUtils.getSampleDir(project, sample, version) + ANALYSIS_FILES);
    }

    /**
     * Parses the analysis files table at an explicit path.
     * @param path Path to the table.
     * @throws FileNotFoundException if the table cannot be found.
     */
    public PicardAnalysisFiles(String path) throws FileNotFoundException {
        this.path = path;
        HashMap<String,Integer> headerIndexes = null;
        for (String line: new XReadLines(new File(path))) {
            if (line.startsWith("#"))
                continue;
            String[] values = line.split("\t");
            if (headerIndexes == null) {
                // First non-comment line is the header row: locate each tracked column.
                headerIndexes = new HashMap<String,Integer>();
                for (String header: ANALYSIS_HEADERS) {
                    int index = ArrayUtils.indexOf(values, header);
                    // ArrayUtils.indexOf returns -1 when the column is absent; fail
                    // fast with a clear message instead of the opaque
                    // ArrayIndexOutOfBoundsException that values[-1] would raise
                    // later on the first data row.
                    if (index < 0)
                        throw new StingException(String.format("Missing required column %s in %s", header, path));
                    headerIndexes.put(header, index);
                    headerValues.put(header, new HashSet<String>());
                }
            } else {
                for (String header: ANALYSIS_HEADERS) {
                    int index = headerIndexes.get(header);
                    if (values.length <= index)
                        throw new StingException(String.format("Unable to parse line in %s: %n%s", path, line));
                    String value = values[index];
                    headerValues.get(header).add(value);
                }
            }
        }
    }

    /** @return the path this table was parsed from. */
    public String getPath() {
        return path;
    }

    /** @return the single REFERENCE_SEQUENCE value, or null. */
    public String getReferenceSequence() {
        return getSingle(REFERENCE_SEQUENCE_HEADER);
    }

    /** @return the single TARGET_INTERVALS value, or null. */
    public String getTargetIntervals() {
        return getSingle(TARGET_INTERVALS_HEADER);
    }

    /** @return the single BAIT_INTERVALS value, or null. */
    public String getBaitIntervals() {
        return getSingle(BAIT_INTERVALS_HEADER);
    }

    // Returns the sole value recorded for the header; null when there were no data
    // rows for it or the value was the literal string "null"; throws when the
    // column held more than one distinct value.
    // NOTE(review): if the file contained no lines at all, headerValues is empty
    // and this NPEs on values.size() — confirm callers only parse populated tables.
    private String getSingle(String header) {
        Set<String> values = headerValues.get(header);
        if (values.size() > 1) {
            throw new UnsupportedOperationException(path + " contains more than one value for " + header + ": " + values);
        } else if (values.size() == 0) {
            return null;
        } else {
            String value = values.iterator().next();
            return "null".equals(value) ? null : value;
        }
    }
}

View File

@ -1,123 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.broad;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang.NullArgumentException;
import org.broadinstitute.sting.datasources.pipeline.Pipeline;
import org.broadinstitute.sting.datasources.pipeline.PipelineProject;
import org.broadinstitute.sting.datasources.pipeline.PipelineSample;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.yaml.YamlUtils;
import java.io.File;
import java.io.FileNotFoundException;
/**
* Automatically gets the latest version using PicardAggregationUtils.
*/
/**
 * Builds a Pipeline description from either a project/sample TSV or a pipeline
 * YAML, then fills in each sample's BAM and the project's reference data by
 * automatically resolving the latest Picard aggregation version via
 * PicardAggregationUtils.
 */
public class PicardPipeline {
protected static final String PROJECT_TAG = "SQUIDProject";
protected static final String SAMPLE_TAG = "CollaboratorID";
protected static final String PICARD_BAM_TYPE = "cleaned";
// Static utility class: not instantiable.
private PicardPipeline() {}
/**
* Creates a new Pipeline from disk.
* @param path Path to a tsv with project [tab] sample on each line or a pipeline yaml.
* @return a new Pipeline with BAMs and reference data resolved.
* @throws FileNotFoundException when unable to find the file or any supporting files.
*/
public static Pipeline parse(File path) throws FileNotFoundException {
if (path == null)
throw new NullArgumentException("path");
Pipeline pipeline;
if (path.getName().endsWith(".tsv")) {
// TSV mode: one "project<TAB>sample" pair per line; the pipeline project
// name is the file's base name.
pipeline = new Pipeline();
pipeline.getProject().setName(FilenameUtils.getBaseName(path.getPath()));
for (String line: new XReadLines(path)) {
String[] projectSample = line.split("\t");
// NOTE(review): a line without a tab yields a 1-element array and an
// ArrayIndexOutOfBoundsException below — confirm input is always well formed.
addSample(pipeline, projectSample[0], projectSample[1]);
}
} else if (path.getName().endsWith(".yaml")) {
pipeline = YamlUtils.load(Pipeline.class, path);
} else {
throw new UserException.BadInput("Path does not end with .tsv or .yaml: " + path.getPath());
}
// Resolve BAM paths, reference data, and intervals for every sample.
update(pipeline);
return pipeline;
}
// Applies updateSample to each sample in the pipeline.
private static void update(Pipeline pipeline) throws FileNotFoundException {
for (PipelineSample sample: pipeline.getSamples())
updateSample(pipeline.getProject(), sample);
}
// Adds a tag-only sample (project/sample names); everything else is filled in
// later by updateSample.
private static void addSample(Pipeline pipeline, String project, String sample) {
PipelineSample pipelineSample = new PipelineSample();
pipelineSample.getTags().put(PROJECT_TAG, project);
pipelineSample.getTags().put(SAMPLE_TAG, sample);
pipeline.getSamples().add(pipelineSample);
}
// Resolves the latest aggregation version for one sample and records its BAM,
// plus (for the first sample only) the project-wide reference configuration.
private static void updateSample(PipelineProject pipelineProject, PipelineSample pipelineSample) throws FileNotFoundException {
// NOTE(review): this guard skips only when BOTH tags are absent; a sample
// carrying exactly one of the two tags proceeds below with a null project
// or sample name — confirm whether '||' was intended here.
if (!pipelineSample.getTags().containsKey(PROJECT_TAG) && !pipelineSample.getTags().containsKey(SAMPLE_TAG))
return;
String project = pipelineSample.getTags().get(PROJECT_TAG);
String sample = pipelineSample.getTags().get(SAMPLE_TAG);
int version = PicardAggregationUtils.getLatestVersion(project, sample);
if (version <= 0)
throw new UserException.BadInput("Project sample not found: " + project + "/" + sample);
String bam = PicardAggregationUtils.getSampleBam(project, sample, version);
if (pipelineSample.getId() == null)
pipelineSample.setId(project + "_" + sample);
pipelineSample.getBamFiles().put(PICARD_BAM_TYPE, new File(bam));
PicardAnalysisFiles analysis = new PicardAnalysisFiles(project, sample, version);
if (pipelineProject.getReferenceFile() == null) {
// First sample establishes the project-wide reference configuration.
String referenceSequence = analysis.getReferenceSequence();
// NOTE(review): getByReference returns null for an unrecognized reference
// path, which would NPE on the next line — confirm inputs are limited to
// the known builds.
ReferenceData referenceData = ReferenceData.getByReference(referenceSequence);
pipelineProject.setReferenceFile(new File(referenceData.getReference()));
pipelineProject.setRefseqTable(new File(referenceData.getRefseq()));
if (analysis.getTargetIntervals() != null)
pipelineProject.setIntervalList(new File(analysis.getTargetIntervals()));
// Evaluation always uses dbSNP 129; genotyping prefers 132 when available.
pipelineProject.setEvalDbsnp(new File(referenceData.getDbsnp(129)));
if (referenceData.getDbsnpVersions().contains(132)) {
pipelineProject.setGenotypeDbsnp(new File(referenceData.getDbsnp(132)));
} else {
pipelineProject.setGenotypeDbsnp(new File(referenceData.getDbsnp(129)));
}
} else {
// Subsequent samples must match the established reference.
String referenceSequence = analysis.getReferenceSequence();
if (!pipelineProject.getReferenceFile().getPath().equals(referenceSequence))
throw new UserException.BadInput("Samples sequenced with different references");
}
}
}

View File

@ -1,135 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.broad;
import java.util.Collections;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
/**
* Tracks data related to reference files at the Broad.
*/
/**
 * Tracks data related to reference files at the Broad: the fasta, the refseq
 * annotation table, and the dbSNP releases available for each genome build.
 */
public enum ReferenceData {
    /**
     * HG18 reference data
     */
    HG18("hg18"),
    /**
     * HG19 reference data
     */
    HG19("hg19");

    private static final String REFSEQ_DIR = "/humgen/gsa-hpprojects/GATK/data/Annotations/refseq/";
    private static final String DBSNP_DIR = "/humgen/gsa-hpprojects/GATK/data/";

    private final String name;
    private final String reference;
    private final String refseq;
    private final Map<Integer,String> dbsnps;

    ReferenceData(String name) {
        this.name = name;
        // Build the version -> path map locally, then freeze it into the field.
        final Map<Integer,String> versionToPath = new TreeMap<Integer,String>();
        if ("hg18".equals(name)) {
            this.reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
            this.refseq = REFSEQ_DIR + "refGene-big-table-hg18.txt";
            versionToPath.put(129, DBSNP_DIR + "dbsnp_129_hg18.rod");
        } else if ("hg19".equals(name)) {
            this.reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta";
            this.refseq = REFSEQ_DIR + "refGene-big-table-hg19.txt";
            versionToPath.put(129, DBSNP_DIR + "dbsnp_129_b37.vcf");
            versionToPath.put(132, DBSNP_DIR + "dbsnp_132_b37.vcf");
        } else {
            throw new UnsupportedOperationException("Unknown reference: " + name);
        }
        this.dbsnps = Collections.unmodifiableMap(versionToPath);
    }

    /**
     * Returns the name of the reference.
     * @return the name of the reference.
     */
    public String getName() {
        return name;
    }

    /**
     * Returns the path to the fasta.
     * @return the path to the fasta.
     */
    public String getReference() {
        return reference;
    }

    /**
     * Returns the path to the refseq table.
     * @return the path to the refseq table.
     */
    public String getRefseq() {
        return refseq;
    }

    /**
     * Returns the dbsnp versions available.
     * @return the dbsnp versions available.
     */
    public Set<Integer> getDbsnpVersions() {
        return dbsnps.keySet();
    }

    /**
     * Returns the dbsnp path for the version, or null for an unknown version.
     * @param version version from getDbsnpVersions()
     * @return the dbsnp path for the version.
     */
    public String getDbsnp(int version) {
        return dbsnps.get(version);
    }

    /**
     * Returns the dbsnp type for the version, "VCF" or "DBSNP", based on the
     * file extension of the dbsnp path, or null for an unknown version.
     * @param version version from getDbsnpVersions()
     * @return the dbsnp type for the version, "VCF" or "DBSNP".
     */
    public String getDbsnpType(int version) {
        final String dbsnpPath = getDbsnp(version);
        if (dbsnpPath == null) {
            return null;
        }
        return dbsnpPath.toLowerCase().endsWith(".vcf") ? "VCF" : "DBSNP";
    }

    /**
     * Returns the reference data whose fasta path matches, or null.
     * @param reference path to the reference
     * @return the reference data based on the path or null.
     */
    public static ReferenceData getByReference(String reference) {
        for (ReferenceData candidate : values()) {
            if (candidate.reference.equals(reference)) {
                return candidate;
            }
        }
        return null;
    }
}

View File

@ -68,13 +68,13 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
ReferenceSequence seq = null;
}
private static ThreadLocal<Cache> cache;
private ThreadLocal<Cache> cache;
static {
{
resetThreadLocalCache();
};
}
protected static void resetThreadLocalCache() {
protected void resetThreadLocalCache() {
cache = new ThreadLocal<Cache> () {
@Override protected Cache initialValue() {
return new Cache();

View File

@ -414,7 +414,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
* @return vc subcontext
*/
public VariantContext subContextFromGenotypes(Collection<Genotype> genotypes, Set<Allele> alleles) {
return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), getFilters(), getAttributes());
return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), filtersWereApplied() ? getFilters() : null, getAttributes());
}

View File

@ -19,7 +19,7 @@ public class RealignerTargetCreatorIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
"-T RealignerTargetCreator -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s",
1,
Arrays.asList("fd2d9dbf718f7a6d82ae1787ac1fe61e"));
Arrays.asList("f23ba17ee0f9573dd307708175d90cd2"));
executeTest("test dbsnp", spec2);
WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec(

View File

@ -26,8 +26,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf",
"d33212a84368e821cbedecd4f59756d6", // tranches
"a35cd067f378442eee8cd5edeea92be0", // recal file
"126d52843f4a57199ee97750ffc16a07"); // cut VCF
"4652dca41222bebdf9d9fda343b2a835", // recal file
"5350b1a4c1250cf3b77ca45327c04711"); // cut VCF
@DataProvider(name = "VRTest")
public Object[][] createData1() {

View File

@ -60,7 +60,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest {
" --NO_HEADER" +
" -o %s",
1,
Arrays.asList("2cae3d16f9ed00b07d87e9c49272d877")
Arrays.asList("debbbf3e661b6857cc8d99ff7635bb1d")
);
executeTest("testSimpleVCFStreaming", spec);
@ -98,7 +98,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest {
" -EV CompOverlap -noEV -noST" +
" -o %s",
1,
Arrays.asList("f60729c900bc8368717653b3fad80d1e")
Arrays.asList("f60729c900bc8368717653b3fad80d1e") //"f60729c900bc8368717653b3fad80d1e"
);
executeTest("testVCFStreamingChain", selectTestSpec);

View File

@ -22,8 +22,10 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.datasources.pipeline;
package org.broadinstitute.sting.pipeline;
import org.broadinstitute.sting.pipeline.Pipeline;
import org.broadinstitute.sting.pipeline.PipelineSample;
import org.testng.Assert;
import org.broadinstitute.sting.utils.yaml.YamlUtils;

View File

@ -12,19 +12,16 @@ import org.testng.annotations.DataProvider;
import org.testng.annotations.BeforeMethod;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.qc.ValidateBAQWalker;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.Utils;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.Arrays;
import java.io.PrintStream;
import java.util.List;
import java.util.ArrayList;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.*;
/**
@ -188,8 +185,8 @@ public class BAQUnitTest extends BaseTest {
System.out.println(Utils.dupString('-', 40));
System.out.println("reads : " + new String(test.readBases));
ValidateBAQWalker.printQuals(System.out, "in-quals:", test.quals, false);
ValidateBAQWalker.printQuals(System.out, "bq-quals:", result.bq, false);
printQuals(System.out, "in-quals:", test.quals, false);
printQuals(System.out, "bq-quals:", result.bq, false);
for (int i = 0; i < test.quals.length; i++) {
//result.bq[i] = baqHMM.capBaseByBAQ(result.rawQuals[i], result.bq[i], result.state[i], i);
Assert.assertTrue(result.bq[i] >= baqHMM.getMinBaseQual() || test.expected[i] < baqHMM.getMinBaseQual(), "BQ < min base quality");
@ -197,4 +194,16 @@ public class BAQUnitTest extends BaseTest {
}
}
/**
 * Prints a base-quality array on a single line, prefixed by the given label.
 *
 * With asInt true, qualities are printed as comma-separated, two-character-wide
 * integers; otherwise each quality is printed as the single character
 * (quality + 33), i.e. FASTQ-style encoding. A trailing newline is written.
 *
 * @param out    destination stream
 * @param prefix label printed before the qualities
 * @param quals  qualities to print
 * @param asInt  true for integer output, false for character output
 */
public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) {
    out.print(prefix);
    for ( int i = 0; i < quals.length; i++ ) {
        if ( asInt ) {
            out.printf("%2d", (int)quals[i]);
            final boolean isLast = (i + 1 == quals.length);
            if ( !isLast ) {
                out.print(",");
            }
        } else {
            out.print((char)(quals[i] + 33));
        }
    }
    out.println();
}
}

View File

@ -1,61 +0,0 @@
package org.broadinstitute.sting.utils.broad;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.io.FileNotFoundException;
/**
 * Integration-style tests for PicardAggregationUtils.
 *
 * NOTE(review): these tests read the live /seq/picard_aggregation filesystem,
 * so they only pass on Broad infrastructure while the chosen project/sample
 * still has a finished version — confirm this is acceptable for this suite.
 */
public class PicardAggregationUtilsUnitTest {
// A project/sample pair expected to exist with at least one finished version.
public static final String PROJECT = "C474";
public static final String SAMPLE = "NA19651";
// A project/sample pair expected NOT to exist.
public static final String MISSING_PROJECT = "C0";
public static final String MISSING_SAMPLE = "0";
// Shared across test methods: populated by testGetLatestVersion and consumed
// by the methods that declare dependsOnMethods on it (TestNG runs all @Test
// methods of a class on the same instance, so the field carries over).
private int latestVersion = -1;
// Resolves and caches the latest finished version; also checks that probing
// again starting from that version is a fixed point.
@Test
public void testGetLatestVersion() {
latestVersion = PicardAggregationUtils.getLatestVersion(PROJECT, SAMPLE);
System.out.println(String.format("Latest version for %s %s is %d", PROJECT, SAMPLE, latestVersion));
Assert.assertTrue(latestVersion > 0);
Assert.assertEquals(PicardAggregationUtils.getLatestVersion(PROJECT, SAMPLE, latestVersion), latestVersion);
}
// The latest-BAM overload must agree with the explicit-version overload.
@Test(dependsOnMethods = "testGetLatestVersion")
public void testGetSampleBam() throws Exception {
String test = PicardAggregationUtils.getSampleBam(PROJECT, SAMPLE);
String latest = PicardAggregationUtils.getSampleBam(PROJECT, SAMPLE, latestVersion);
Assert.assertEquals(test, latest);
}
// The latest-dir overload must agree with the explicit-version overload.
@Test(dependsOnMethods = "testGetLatestVersion")
public void testGetSampleDir() throws Exception {
String test = PicardAggregationUtils.getSampleDir(PROJECT, SAMPLE);
String latest = PicardAggregationUtils.getSampleDir(PROJECT, SAMPLE, latestVersion);
Assert.assertEquals(test, latest);
}
// The latest version is finished; the one past it is not (by definition of latest).
@Test(dependsOnMethods = "testGetLatestVersion")
public void testIsFinished() {
Assert.assertTrue(PicardAggregationUtils.isFinished(PROJECT, SAMPLE, latestVersion));
Assert.assertFalse(PicardAggregationUtils.isFinished(PROJECT, SAMPLE, latestVersion + 1));
}
// Missing samples surface as FileNotFoundException from the latest-version overloads.
@Test(expectedExceptions = FileNotFoundException.class)
public void testMissingSampleBam() throws Exception {
PicardAggregationUtils.getSampleBam(MISSING_PROJECT, MISSING_SAMPLE);
}
@Test(expectedExceptions = FileNotFoundException.class)
public void testMissingSampleDir() throws Exception {
PicardAggregationUtils.getSampleDir(MISSING_PROJECT, MISSING_SAMPLE);
}
// When nothing is found, getLatestVersion falls back to the caller-supplied
// startVersion (including negative values), and 0 when none was supplied.
@Test
public void testLatestVersionMissing() {
Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE), 0);
Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE, -1), -1);
Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE, 0), 0);
Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE, 1), 1);
Assert.assertEquals(PicardAggregationUtils.getLatestVersion(MISSING_PROJECT, MISSING_SAMPLE, 2), 2);
}
}

View File

@ -1,56 +0,0 @@
package org.broadinstitute.sting.utils.broad;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.io.FileNotFoundException;
import static org.broadinstitute.sting.utils.broad.PicardAggregationUtilsUnitTest.*;
/**
 * Tests for {@link PicardAnalysisFiles}: resolving the latest analysis file for a
 * project/sample, parsing the fixture files (with and without comments), and the
 * error paths for missing files, missing versions, and conflicting references.
 */
public class PicardAnalysisFilesUnitTest extends BaseTest {
    @Test
    public void testParseLatest() throws Exception {
        // Omitting the version should resolve to a readable path...
        PicardAnalysisFiles analysisFiles = new PicardAnalysisFiles(PROJECT, SAMPLE);
        Assert.assertNotNull(analysisFiles.getPath());
        // ...and passing the latest version explicitly should do the same.
        int latestVersion = PicardAggregationUtils.getLatestVersion(PROJECT, SAMPLE);
        analysisFiles = new PicardAnalysisFiles(PROJECT, SAMPLE, latestVersion);
        Assert.assertNotNull(analysisFiles.getPath());
    }

    @Test
    public void testParseValid() throws Exception {
        PicardAnalysisFiles parsed = new PicardAnalysisFiles(BaseTest.validationDataLocation + "picard_analysis_file.txt");
        assertStandardExomeMetadata(parsed);
    }

    @Test
    public void testParseValidWithComments() throws Exception {
        // Comment lines in the analysis file must not affect the parsed metadata.
        PicardAnalysisFiles parsed = new PicardAnalysisFiles(BaseTest.validationDataLocation + "picard_analysis_file_with_comments.txt");
        assertStandardExomeMetadata(parsed);
    }

    /** Asserts the reference/target/bait paths shared by the standard exome fixtures. */
    private void assertStandardExomeMetadata(PicardAnalysisFiles parsed) {
        Assert.assertEquals(parsed.getReferenceSequence(), "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta");
        Assert.assertEquals(parsed.getTargetIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list");
        Assert.assertEquals(parsed.getBaitIntervals(), "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list");
    }

    @Test(expectedExceptions = FileNotFoundException.class)
    public void testParseBadPath() throws Exception {
        new PicardAnalysisFiles(BaseTest.validationDataLocation + "non_existent_picard_analysis_file.txt");
    }

    @Test(expectedExceptions = FileNotFoundException.class)
    public void testParseMissingLatest() throws Exception {
        new PicardAnalysisFiles(MISSING_PROJECT, MISSING_SAMPLE);
    }

    @Test(expectedExceptions = FileNotFoundException.class)
    public void testParseMissingVersion() throws Exception {
        // Two past the latest version cannot exist, so construction must fail.
        new PicardAnalysisFiles(PROJECT, SAMPLE, PicardAggregationUtils.getLatestVersion(PROJECT, SAMPLE) + 2);
    }

    @Test(expectedExceptions = UnsupportedOperationException.class)
    public void testParseMultipleReferences() throws Exception {
        // A file listing conflicting references parses, but querying the single
        // reference must be rejected.
        PicardAnalysisFiles conflicting = new PicardAnalysisFiles(BaseTest.validationDataLocation + "picard_analysis_file_with_different_references.txt");
        conflicting.getReferenceSequence();
    }
}

View File

@ -1,71 +0,0 @@
package org.broadinstitute.sting.utils.broad;
import org.testng.Assert;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.datasources.pipeline.Pipeline;
import org.broadinstitute.sting.datasources.pipeline.PipelineSample;
import org.broadinstitute.sting.utils.yaml.YamlUtils;
import org.testng.annotations.Test;
import static org.broadinstitute.sting.utils.broad.PicardAggregationUtilsUnitTest.*;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
/**
 * Unit tests for {@link PicardPipeline}: parsing pipeline descriptions from a
 * project/sample TSV and from a YAML dump, and validating the resulting
 * {@link Pipeline} structure.
 */
public class PicardPipelineUnitTest {
    @Test
    public void testParseTsv() throws IOException {
        File tsvFile = writeTsv(PROJECT, SAMPLE);
        Pipeline parsed = PicardPipeline.parse(tsvFile);
        // A TSV carries no project name of its own, so the file's base name is used.
        validatePipeline(parsed, FilenameUtils.getBaseName(tsvFile.getPath()));
    }

    @Test
    public void testParseTsvWithPicardComments() throws Exception {
        // This project/sample pair must parse without error; no further validation.
        File tsvFile = writeTsv("C460", "HG01359");
        PicardPipeline.parse(tsvFile);
    }

    @Test
    public void testParseYaml() throws IOException {
        File yamlFile = writeYaml("project_name", PROJECT, SAMPLE);
        Pipeline parsed = PicardPipeline.parse(yamlFile);
        // YAML carries the project name explicitly.
        validatePipeline(parsed, "project_name");
    }

    /**
     * Asserts the parsed pipeline has the expected name, that every project-level
     * file resolves on disk, and that it holds exactly one correctly tagged sample.
     */
    private void validatePipeline(Pipeline pipeline, String name) {
        Assert.assertEquals(pipeline.getProject().getName(), name);
        Assert.assertTrue(pipeline.getProject().getReferenceFile().exists(), "reference not found");
        Assert.assertTrue(pipeline.getProject().getIntervalList().exists(), "intervals not found");
        Assert.assertTrue(pipeline.getProject().getRefseqTable().exists(), "refseq not found");
        Assert.assertTrue(pipeline.getProject().getGenotypeDbsnp().exists(), "genotype dbsnp not found");
        Assert.assertTrue(pipeline.getProject().getEvalDbsnp().exists(), "eval dbsnp not found");
        Assert.assertEquals(pipeline.getSamples().size(), 1);
        for (PipelineSample sample : pipeline.getSamples()) {
            Assert.assertEquals(sample.getId(), PROJECT + "_" + SAMPLE);
            Assert.assertTrue(sample.getBamFiles().get(PicardPipeline.PICARD_BAM_TYPE).exists(), "bam not found");
            Assert.assertEquals(sample.getTags().get(PicardPipeline.PROJECT_TAG), PROJECT);
            Assert.assertEquals(sample.getTags().get(PicardPipeline.SAMPLE_TAG), SAMPLE);
        }
    }

    /** Writes a single project&lt;TAB&gt;sample line to a temporary .tsv file. */
    private File writeTsv(String project, String sample) throws IOException {
        File tsv = BaseTest.createTempFile("pipeline", ".tsv");
        FileUtils.writeLines(tsv, Collections.singletonList(project + "\t" + sample));
        return tsv;
    }

    /** Dumps a minimal one-sample pipeline with the given name to a temporary .yaml file. */
    private File writeYaml(String projectName, String project, String sample) throws IOException {
        File yaml = BaseTest.createTempFile("pipeline", ".yaml");
        PipelineSample taggedSample = new PipelineSample();
        taggedSample.getTags().put(PicardPipeline.PROJECT_TAG, project);
        taggedSample.getTags().put(PicardPipeline.SAMPLE_TAG, sample);
        Pipeline pipeline = new Pipeline();
        pipeline.getProject().setName(projectName);
        pipeline.getSamples().add(taggedSample);
        YamlUtils.dump(pipeline, yaml);
        return yaml;
    }
}

View File

@ -1,49 +0,0 @@
package org.broadinstitute.sting.utils.broad;
import org.broadinstitute.sting.BaseTest;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.io.File;
/**
 * Unit tests for the ReferenceData enum: reference names, existence of every
 * configured file on disk, dbSNP path/type lookups (including unknown versions),
 * and reverse lookup from a reference path.
 */
public class ReferenceDataUnitTest {
    @Test
    public void testNames() {
        Assert.assertEquals(ReferenceData.HG18.getName(), "hg18");
        Assert.assertEquals(ReferenceData.HG19.getName(), "hg19");
    }

    @Test
    public void testFilesExist() {
        // Every configured reference, refseq table, and dbSNP release must be on disk.
        for (ReferenceData reference : ReferenceData.values()) {
            Assert.assertTrue(new File(reference.getReference()).exists());
            Assert.assertTrue(new File(reference.getRefseq()).exists());
            for (int dbsnpVersion : reference.getDbsnpVersions()) {
                Assert.assertTrue(new File(reference.getDbsnp(dbsnpVersion)).exists());
            }
        }
    }

    @Test
    public void testDbsnps() {
        Assert.assertTrue(new File(ReferenceData.HG18.getDbsnp(129)).exists());
        Assert.assertTrue(new File(ReferenceData.HG19.getDbsnp(129)).exists());
        Assert.assertTrue(new File(ReferenceData.HG19.getDbsnp(132)).exists());
        // An unconfigured version yields null rather than a bogus path.
        Assert.assertNull(ReferenceData.HG19.getDbsnp(130));
    }

    @Test
    public void testDbsnpTypes() {
        Assert.assertEquals(ReferenceData.HG18.getDbsnpType(129), "DBSNP");
        Assert.assertEquals(ReferenceData.HG19.getDbsnpType(129), "VCF");
        Assert.assertEquals(ReferenceData.HG19.getDbsnpType(132), "VCF");
        Assert.assertNull(ReferenceData.HG19.getDbsnpType(130));
    }

    @Test
    public void testGetByReference() {
        Assert.assertEquals(ReferenceData.getByReference(BaseTest.hg18Reference), ReferenceData.HG18);
        Assert.assertEquals(ReferenceData.getByReference(BaseTest.hg19Reference), ReferenceData.HG19);
        // Unknown reference paths map to null.
        Assert.assertNull(ReferenceData.getByReference("none"));
    }
}

View File

@ -34,11 +34,6 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
private static final List<Integer> QUERY_SIZES = Arrays.asList(1, 10, 100, 1000);
private static final List<Integer> CACHE_SIZES = Arrays.asList(-1, 10, 1000);
@BeforeMethod
public void clearCache() {
CachingIndexedFastaSequenceFile.resetThreadLocalCache();
}
@DataProvider(name = "fastas")
public Object[][] createData1() {
List<Object[]> params = new ArrayList<Object[]>();

View File

@ -15,7 +15,7 @@
</executable>
<resources>
<!-- Supplemental scripts for graph generation, etc. -->
<file name="R/plot_residualError_OtherCovariate.R" />
<file name="R/plot_residualError_QualityScoreCovariate.R" />
<file name="public/R/plot_residualError_OtherCovariate.R" />
<file name="public/R/plot_residualError_QualityScoreCovariate.R" />
</resources>
</package>

View File

@ -40,10 +40,10 @@
</executable>
<resources>
<!-- Sample reads and reference files -->
<file name="testdata/exampleBAM.bam" />
<file name="testdata/exampleBAM.bam.bai" />
<file name="testdata/exampleFASTA.fasta" />
<file name="testdata/exampleFASTA.fasta.fai" />
<file name="testdata/exampleFASTA.dict" />
<file name="public/testdata/exampleBAM.bam" />
<file name="public/testdata/exampleBAM.bam.bai" />
<file name="public/testdata/exampleFASTA.fasta" />
<file name="public/testdata/exampleFASTA.fasta.fai" />
<file name="public/testdata/exampleFASTA.dict" />
</resources>
</package>

View File

@ -13,12 +13,14 @@
</modules>
<resources>
<!-- GATK sample code and build scripts -->
<file name="java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java" />
<file name="java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java" />
<file name="java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java" />
<file name="java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java" />
<file name="java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java" />
<file name="java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java" />
<file name="public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java" />
<file name="public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java" />
<file name="public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java" />
<file name="public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java" />
<file name="public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java" />
<file name="public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java" />
<file name="public/R/plot_Tranches.R" />
<file name="public/R/titvFPEst.R" />
</resources>
<release>
<executable directory="/humgen/gsa-hpprojects/GATK/bin" symlink="current" />

View File

@ -33,7 +33,7 @@
<package name="org.broadinstitute.sting.queue.**" />
<!-- Pipeline + Utils -->
<package name="org.broadinstitute.sting.datasources.pipeline.**" />
<package name="org.broadinstitute.sting.pipeline.**" />
<package name="org.broadinstitute.sting.utils.**" />
<!-- Scala -->

View File

@ -0,0 +1,83 @@
#!/usr/bin/perl -w
# Runs the liftover tool on a VCF and properly handles the output
#
# Pipeline: (1) GATK LiftoverVariants maps each record to the new reference,
# (2) the body is re-sorted to the new reference's contig order via sortByRef.pl,
# (3) GATK FilterLiftedVariants fixes or drops bad records into the final output.
# Temporary files are written under --tmp and removed only on success.
use strict;
use Getopt::Long;
# Command-line options; all are required except -tmp and -recordOriginalLocation.
my $in = undef;
my $gatk = undef;
my $chain = undef;
my $newRef = undef;
my $oldRef = undef;
my $out = undef;
my $tmp = "/tmp";
my $recordOriginalLocation = 0;
GetOptions( "vcf=s" => \$in,
"gatk=s" => \$gatk,
"chain=s" => \$chain,
"newRef=s" => \$newRef,
"oldRef=s" => \$oldRef,
"out=s" => \$out,
"tmp=s" => \$tmp,
"recordOriginalLocation" => \$recordOriginalLocation);
# Any missing required option prints usage and exits non-zero.
if ( !$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out ) {
print "Usage: liftOverVCF.pl\n\t-vcf \t\t<input vcf>\n\t-gatk \t\t<path to gatk trunk>\n\t-chain \t\t<chain file>\n\t-newRef \t<path to new reference prefix; we will need newRef.dict, .fasta, and .fasta.fai>\n\t-oldRef \t<path to old reference prefix; we will need oldRef.fasta>\n\t-out \t\t<output vcf>\n\t-tmp \t\t<temp file location; defaults to /tmp>\n\t-recordOriginalLocation \t\t<Should we record what the original location was in the INFO field?; defaults to false>\n";
print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /humgen/1kg/reference/human_b36_both\n";
exit(1);
}
# generate a random number
# NOTE(review): rand() makes a low-entropy temp-file prefix; concurrent runs
# sharing --tmp could collide -- consider including $$ (the PID) instead.
my $random_number = rand();
my $tmp_prefix = "$tmp/$random_number";
print "Writing temporary files to prefix: $tmp_prefix\n";
my $unsorted_vcf = "$tmp_prefix.unsorted.vcf";
# lift over the file
print "Lifting over the vcf...";
my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $unsorted_vcf -chain $chain -dict $newRef.dict";
if ($recordOriginalLocation) {
$cmd .= " -recordOriginalLocation";
}
system($cmd) == 0 or quit("The liftover step failed. Please correct the necessary errors before retrying.");
# we need to sort the lifted over file now
print "\nRe-sorting the vcf...\n";
my $sorted_vcf = "$tmp_prefix.sorted.vcf";
open(SORTED, ">$sorted_vcf") or die "can't open $sorted_vcf: $!";
# write the header
# Copy leading '#' header lines through unchanged.  The first non-header line
# read here is intentionally NOT written: the grep pipeline below re-emits
# every non-header line from the unsorted file.
open(UNSORTED, "< $unsorted_vcf") or die "can't open $unsorted_vcf: $!";
my $inHeader = 1;
while ( $inHeader == 1 ) {
my $line = <UNSORTED>;
if ( $line !~ m/^#/ ) {
$inHeader = 0;
} else {
print SORTED "$line";
}
}
close(UNSORTED);
close(SORTED);
# Sort the body numerically by position (-k2), then by the new reference's
# contig order via sortByRef.pl, appending after the saved header.
$cmd = "grep \"^#\" -v $unsorted_vcf | sort -n -k2 -T $tmp | $gatk/public/perl/sortByRef.pl --tmp $tmp - $newRef.fasta.fai >> $sorted_vcf";
system($cmd) == 0 or quit("The sorting step failed. Please correct the necessary errors before retrying.");
# Filter the VCF for bad records
print "\nFixing/removing bad records...\n";
$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $sorted_vcf -o $out";
system($cmd) == 0 or quit("The filtering step failed. Please correct the necessary errors before retrying.");
# clean up
unlink $unsorted_vcf;
unlink $sorted_vcf;
my $sorted_index = "$sorted_vcf.idx";
unlink $sorted_index;
print "\nDone!\n";
# Print the given error message and exit non-zero.
# NOTE(review): temp files from the failed step are left behind for debugging.
sub quit {
print "\n$_[0]\n";
exit(1);
}

View File

@ -0,0 +1,127 @@
#!/usr/bin/perl -w
#
# sortByRef.pl - stable sort of input lines by reference contig order.
#
# Contig order is read from REF_DICT (a .fai file, or any file whose first
# column lists contigs in the desired order).  Each input line is bucketed
# into a per-contig temporary file in a single pass, then the buckets are
# concatenated in dictionary order, which keeps the sort stable and memory
# usage flat.  Contigs absent from the dictionary are appended after all
# known contigs, in first-seen order.
use strict;
use Getopt::Long;

# Print command-line help and exit with a failure status.
sub usage {
    print "\nUsage:\n";
    print "sortByRef.pl [--k POS] [--tmp dir] INPUT REF_DICT\n\n";
    print " Sorts lines of the input file INFILE according\n";
    print " to the reference contig order specified by the\n";
    print " reference dictionary REF_DICT (.fai file).\n";
    print " The sort is stable. If -k option is not specified,\n";
    print " it is assumed that the contig name is the first\n";
    print " field in each line.\n\n";
    print " INPUT input file to sort. If '-' is specified, \n";
    print " then reads from STDIN.\n";
    print " REF_DICT .fai file, or ANY file that has contigs, in the\n";
    print " desired sorting order, as its first column.\n";
    print " --k POS : contig name is in the field POS (1-based)\n";
    print " of input lines.\n\n";
    print " --tmp DIR : temp directory [default=/tmp]\n\n";
    exit(1);
}

# --k POS: 1-based field index of the contig name; --tmp: temp directory.
my $pos = 1;
my $tmp = "/tmp";
GetOptions( "k:i" => \$pos,
            "tmp=s" => \$tmp);
$pos--;    # convert to a 0-based field index

usage() if ( scalar(@ARGV) == 0 );
if ( scalar(@ARGV) != 2 ) {
    print "Wrong number of arguments\n";
    usage();
}

my $input_file = $ARGV[0];
my $dict_file = $ARGV[1];

# Load the dictionary: map contig name -> rank (0-based line position).
open(DICT, "< $dict_file") or die("Can not open $dict_file: $!");
my %ref_order;
my $n = 0;
while ( <DICT> ) {
    chomp;
    my ($contig, $rest) = split "\t";
    die("Dictionary file is probably corrupt: multiple instances of contig $contig") if ( defined $ref_order{$contig} );
    $ref_order{$contig} = $n;
    $n++;
}
close DICT;
# we have loaded contig ordering now

my $INPUT;
if ($input_file eq "-" ) {
    $INPUT = "STDIN";    # string filehandle name; readline resolves it
} else {
    open($INPUT, "< $input_file") or die("Can not open $input_file: $!");
}

# Bucket pass: append each input line to the temp file for its contig's rank.
my %temp_outputs;    # rank -> open write handle
while ( <$INPUT> ) {
    my @fields = split '\s';
    die("Specified field position exceeds the number of fields:\n$_")
        if ( $pos >= scalar(@fields) );

    my $contig = $fields[$pos];
    # Accept "contig:start-stop" style locations; keep only the contig part.
    if ( $contig =~ m/:/ ) {
        my @loc = split(/:/, $contig);
        $contig = $loc[0]
    }
    chomp $contig if ( $pos == scalar(@fields) - 1 ); # if last field in line

    my $order;
    if ( defined $ref_order{$contig} ) { $order = $ref_order{$contig}; }
    else {
        # Contig not in the dictionary: give it the next rank so its lines
        # come out after all known contigs.
        $ref_order{$contig} = $n;
        $order = $n;
        $n++;
    }

    my $fhandle;
    if ( defined $temp_outputs{$order} ) { $fhandle = $temp_outputs{$order} }
    else {
        open( $fhandle, " > $tmp/sortByRef.$$.$order.tmp" ) or
            die ( "Can not open temporary file $order: $!");
        $temp_outputs{$order} = $fhandle;
    }
    # Send the current line to the temp file holding all lines for $contig.
    print $fhandle $_;
}
close $INPUT;
foreach my $f ( values %temp_outputs ) { close $f; }

# Merge pass: emit buckets in rank order, deleting each temp file after use.
for ( my $i = 0 ; $i < $n ; $i++ ) {
    # No lines were seen for rank $i -> no temp file, nothing to do.
    next if ( ! defined $temp_outputs{$i} ) ;
    my $f;
    # BUGFIX: this reopen was previously unchecked, so a vanished or
    # unreadable temp file silently dropped that contig's lines.
    open ( $f, "< $tmp/sortByRef.$$.$i.tmp" ) or
        die("Can not reopen temporary file $tmp/sortByRef.$$.$i.tmp: $!");
    while ( <$f> ) { print ; }
    close $f;
    unlink "$tmp/sortByRef.$$.$i.tmp";
}

View File

@ -1,15 +1,15 @@
package core
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.function.ListWriterFunction
import net.sf.samtools.{SAMFileReader,SAMReadGroupRecord}
import scala.io.Source._
import collection.JavaConversions._
import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel
import org.broadinstitute.sting.queue.extensions.picard._
import net.sf.samtools.{SAMFileReader, SAMReadGroupRecord}
import net.sf.samtools.SAMFileHeader.SortOrder
class DataProcessingPipeline extends QScript {
@ -69,6 +69,8 @@ class DataProcessingPipeline extends QScript {
@Input(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false)
var useBWApe: Boolean = false
@Input(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false)
var bwaThreads: Int = 1
/****************************************************************************
@ -180,6 +182,7 @@ class DataProcessingPipeline extends QScript {
var realignedBams: List[File] = List()
var index = 1
for (bam <- bams) {
val readSortedBam = swapExt(bam, ".bam", "." + index + ".sorted.bam" )
val saiFile1 = swapExt(bam, ".bam", "." + index + ".1.sai")
val saiFile2 = swapExt(bam, ".bam", "." + index + ".2.sai")
val realignedSamFile = swapExt(bam, ".bam", "." + index + ".realigned.sam")
@ -190,11 +193,12 @@ class DataProcessingPipeline extends QScript {
bwa_sam_se(bam, saiFile1, realignedSamFile))
}
else {
add(bwa_aln_pe(bam, saiFile1, 1),
bwa_aln_pe(bam, saiFile2, 2),
bwa_sam_pe(bam, saiFile1, saiFile2, realignedSamFile))
add(sortSam(bam, readSortedBam, SortOrder.queryname),
bwa_aln_pe(readSortedBam, saiFile1, 1),
bwa_aln_pe(readSortedBam, saiFile2, 2),
bwa_sam_pe(readSortedBam, saiFile1, saiFile2, realignedSamFile))
}
add(sortSam(realignedSamFile, realignedBamFile))
add(sortSam(realignedSamFile, realignedBamFile, SortOrder.coordinate))
addReadGroups(realignedBamFile, rgRealignedBamFile, new SAMFileReader(bam))
realignedBams :+= rgRealignedBamFile
index = index + 1
@ -385,9 +389,10 @@ class DataProcessingPipeline extends QScript {
this.jobName = queueLogDir + outBam + ".joinBams"
}
case class sortSam (inSam: File, outBam: File) extends SortSam {
case class sortSam (inSam: File, outBam: File, sortOrderP: SortOrder) extends SortSam {
this.input = List(inSam)
this.output = outBam
this.sortOrder = sortOrderP
this.memoryLimit = 4
this.isIntermediate = true
this.analysisName = queueLogDir + outBam + ".sortSam"
@ -430,7 +435,7 @@ class DataProcessingPipeline extends QScript {
case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with BWACommonArgs {
@Input(doc="bam file to be aligned") var bam = inBam
@Output(doc="output sai file") var sai = outSai
def commandLine = bwaPath + " aln -q 5 " + reference + " -b " + bam + " > " + sai
def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b " + bam + " > " + sai
this.analysisName = queueLogDir + outSai + ".bwa_aln_se"
this.jobName = queueLogDir + outSai + ".bwa_aln_se"
}
@ -438,7 +443,7 @@ class DataProcessingPipeline extends QScript {
case class bwa_aln_pe (inBam: File, outSai1: File, index: Int) extends CommandLineFunction with BWACommonArgs {
@Input(doc="bam file to be aligned") var bam = inBam
@Output(doc="output sai file for 1st mating pair") var sai = outSai1
def commandLine = bwaPath + " aln -q 5 " + reference + " -b" + index + " " + bam + " > " + sai
def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b" + index + " " + bam + " > " + sai
this.analysisName = queueLogDir + outSai1 + ".bwa_aln_pe1"
this.jobName = queueLogDir + outSai1 + ".bwa_aln_pe1"
}

View File

@ -1,4 +1,4 @@
package core
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._

View File

@ -1,3 +1,5 @@
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.commandline.Hidden
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript

View File

@ -1,4 +1,4 @@
package core
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._
@ -41,9 +41,9 @@ class RecalibrateBaseQualities extends QScript {
nContigs = getNumberOfContigs(input)
val recalFile1: File = new File("recal1.csv")
val recalFile2: File = new File("recal2.csv")
val recalBam: File = swapExt(input, ".bam", "recal.bam")
val recalFile1: File = swapExt(input, ".bam", "recal1.csv")
val recalFile2: File = swapExt(input, ".bam", "recal2.csv")
val recalBam: File = swapExt(input, ".bam", "recal.bam")
val path1: String = "before"
val path2: String = "after"
@ -88,4 +88,4 @@ class RecalibrateBaseQualities extends QScript {
this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates"
this.jobName = queueLogDir + inRecalFile + ".analyze_covariates"
}
}
}

View File

@ -1,4 +1,4 @@
package core
package org.broadinstitute.sting.queue.qscripts
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk.RodBind

View File

@ -1,3 +1,5 @@
package org.broadinstitute.sting.queue.qscripts.examples
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._

View File

@ -1,3 +1,5 @@
package org.broadinstitute.sting.queue.qscripts.examples
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._

View File

@ -1,3 +1,5 @@
package org.broadinstitute.sting.queue.qscripts.examples
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._

View File

@ -1,3 +1,5 @@
package org.broadinstitute.sting.queue.qscripts.examples
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._

View File

@ -1,3 +1,5 @@
package org.broadinstitute.sting.queue.qscripts.examples
import org.broadinstitute.sting.queue.QScript
class HelloWorld extends QScript {

View File

@ -1,3 +1,5 @@
package org.broadinstitute.sting.queue.qscripts.lib
import org.broadinstitute.sting.commandline.Hidden
import org.broadinstitute.sting.queue.extensions.gatk.{RodBind, VariantsToTable}
import org.broadinstitute.sting.queue.QScript

View File

@ -1,211 +0,0 @@
package org.broadinstitute.sting.scala
/**
import gatk.walkers.genotyper.{UnifiedGenotyper, GenotypeCall}
import java.io.File
import net.sf.samtools.SAMRecord
import org.broadinstitute.sting.gatk.walkers.LocusWalker
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker
import org.broadinstitute.sting.gatk.contexts.ReferenceContext
import org.broadinstitute.sting.gatk.contexts.AlignmentContext
import org.broadinstitute.sting.utils.Pair
import org.broadinstitute.sting.utils.genotype.GenotypeMetaData
import utils._
import cmdLine.Argument
class TransitionTable() {
var table: Array[Array[Double]] = new Array[Array[Double]](4,4)
def makeKey(b: Char): Int = {
return BaseUtils.simpleBaseToBaseIndex(b)
}
def fromKey(i: Int): Char = {
return BaseUtils.baseIndexToSimpleBase(i)
}
def add(ref: Char, subst: Char) = {
table(makeKey(ref))(makeKey(subst)) += 1
}
def get(i: Int, j: Int): Double = {
return table(i)(j)
}
def set(i: Int, j: Int, d: Double) = {
//printf("Setting %d / %d => %f%n", i, j, d)
table(i)(j) = d
}
def mapEntries[A](f: ((Int, Int, Double) => A)): List[A] = {
return for { i <- List.range(0, 4); j <- List.range(0,4) } yield f(i, j, get(i,j))
}
def header(): String = {
//return Utils.join("\t", mapEntries((i,j,x) => "P(%c|true=%c)" format (fromKey(j), fromKey(i))).toArray)
return Utils.join("\t", mapEntries((i,j,x) => "[%c|%c]" format (fromKey(j), fromKey(i))).toArray)
}
override def toString(): String = {
return Utils.join("\t", mapEntries((i,j,x) => "%.2f" format x).toArray)
}
def sum(): Double = {
return sumList(table.toList map (x => sumList(x.toList)))
}
def sumList(xs: List[Double]): Double = {
return (0.0 :: xs) reduceLeft {(x, y) => x + y}
}
def getRateMatrix(): TransitionTable = {
var sums = (table.toList map (x => sumList(x.toList))).toArray
var t = new TransitionTable()
//println(sums.toList)
this mapEntries ((i,j,x) => t.set(i, j, x / sums(i)))
return t
}
}
**/class BaseTransitionTableCalculator /**{ // extends LocusWalker[Unit,Int] {
private var MIN_MAPPING_QUALITY = 30
private var MIN_BASE_QUALITY = 20
private var MIN_LOD = 5
private var PRINT_FREQ = 1000000
private var CARE_ABOUT_STRAND = true
private val SSG = new UnifiedGenotyper()
private val table1 = new TransitionTable()
private val tableFWD = new TransitionTable()
private val tableREV = new TransitionTable()
private val table2 = new TransitionTable()
@Argument() { val shortName = "v", val doc = "Print out verbose output", val required = false }
var VERBOSE: Boolean = false
override def initialize() {
SSG.initialize();
}
override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext): Unit = {
val callPair = SSG.map(tracker, ref, context)
//val call = callPair.getFirst().get(0)
val pileup = new ReadBackedPileup(ref.getBase(), context)
def hasNoNs(): Boolean = {
return ! (pileup.getBases() exists ('N' ==))
}
if ( isDefinitelyHomRef(callPair) && hasNoNs() ) {
var (refBases, nonRefBases) = splitBases(ref.getBase, pileup.getBases)
//printf("nonRefBases %s%n", nonRefBases)
if ( nonRefBases.length > 0 ) {
var (nonRefReads, offsets) = getNonRefReads(ref.getBase, pileup)
var nonRefRead: SAMRecord = nonRefReads.head
nonRefBases.length match {
case 1 =>
//if ( goodRead(nonRefRead,offsets.head) ) {
//printf("Including site:%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", nonRefBases, refBases, nonRefRead.format, offsets.head)
//}
addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table1)
addRead(nonRefRead, offsets.head, nonRefBases.head, ref, if ( nonRefRead.getReadNegativeStrandFlag ) tableREV else tableFWD )
case 2 =>
addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table2)
addRead(nonRefReads.tail.head, offsets.tail.head, nonRefBases.tail.head, ref, table2)
case x =>
}
if ( VERBOSE && pileup.size > 30 && nonRefReads.length < 4 )
printNonRefBases(ref, nonRefReads, offsets, context.getReads.size())
}
}
}
implicit def listToJavaList[T](l: List[T]) = java.util.Arrays.asList(l.toArray: _*)
def printNonRefBases(ref: ReferenceContext, nonRefReads: List[SAMRecord], offsets: List[Integer], depth: Integer): Unit = {
out.println(new ReadBackedPileup(ref.getLocus(), ref.getBase(), listToJavaList(nonRefReads), offsets) + " " + depth)
}
def addRead(nonRefRead: SAMRecord, offset: Integer, nonRefBase: Char, ref: ReferenceContext, table: TransitionTable): Unit = {
if ( goodRead(nonRefRead, offset) ) {
//printf("Including site:%n call=%s%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", nonRefBases, refBases, nonRefRead.format, offsets.head)
//println(call.getReadDepth, call.getReferencebase, nonRefBases, refBases, nonRefRead.getMappingQuality)
if ( CARE_ABOUT_STRAND && nonRefRead.getReadNegativeStrandFlag() ) {
// it's on the reverse strand
val refComp: Char = BaseUtils.simpleComplement(ref.getBase)
val nonRefComp: Char = BaseUtils.simpleComplement(nonRefBase)
//printf("Negative: %c/%c is actually %c/%c%n", ref.getBase, nonRefBases.head, refComp, nonRefComp)
table.add(refComp, nonRefComp)
} else {
table.add(ref.getBase, nonRefBase)
}
}
}
def getNonRefReads(ref: Char, pileup: ReadBackedPileup): (List[SAMRecord], List[Integer]) = {
def getBase(i: Int): Char = {
var offset = pileup.getOffsets().get(i)
return pileup.getReads().get(i).getReadString().charAt(offset.asInstanceOf[Int])
}
var subset = for { i <- List.range(0, pileup.size) if getBase(i) != ref } yield i
var reads = subset map (i => pileup.getReads().get(i))
var offsets = subset map (i => pileup.getOffsets().get(i))
//println(reads, offsets, subset)
return (reads, offsets)
}
def hasFewNonRefBases(refBase: Char, pileup: String): List[Char] = {
splitBases(refBase, pileup) match {
case (refs, nonRefs) =>
//printf("nrf=%s, rb=%s%n", nonRefs.mkString, refs.mkString)
return nonRefs
}
}
def goodRead( read: SAMRecord, offset: Integer ): Boolean = {
return read.getMappingQuality() > MIN_MAPPING_QUALITY && read.getBaseQualities()(offset.intValue) > MIN_BASE_QUALITY
}
def splitBases(refBase: Char, pileup: String): (List[Char], List[Char]) = {
val refBases = pileup.toList filter (refBase ==)
val nonRefBases = pileup.toList filter (refBase !=)
return (refBases, nonRefBases)
}
def isDefinitelyHomRef(call: Pair[java.util.List[GenotypeCall],GenotypeMetaData]): Boolean = {
if ( call == null )
return false;
return (! call.getFirst().get(0).isVariant()) && call.getFirst().get(0).getNegLog10PError() > MIN_LOD
}
def reduceInit(): Int = {
return 0;
}
def reduce(m: Unit, r: Int): Int = {
return r;
}
override def onTraversalDone(r: Int) = {
out.println("-------------------- FINAL RESULT --------------------")
out.println("type\t" + table1.header)
def print1(t: String, x: TransitionTable) = {
out.println(t + "\t" + x)
}
print1("1-mismatch", table1)
print1("2-mismatch", table2)
print1("1-mismatch-fwd", tableFWD)
print1("1-mismatch-rev", tableREV)
}
}
*/

View File

@ -1,120 +0,0 @@
package org.broadinstitute.sting.scala
import java.io.PrintStream
import org.broadinstitute.sting.gatk.contexts.{ReferenceContext, AlignmentContext}
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker
import collection.JavaConversions._
import collection.immutable.List
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature
import org.broadinstitute.sting.gatk.walkers.{TreeReducible, RefWalker}
import org.broadinstitute.sting.commandline.{Output, Argument}
import org.broadinstitute.sting.utils.{BaseUtils, GenomeLoc}
import collection.mutable.{ListBuffer, HashSet}
import java.lang.Math
// Walks reference loci, matching "interval_list" intervals of interest against
// overlapping refseq transcripts and "table" annotation rows.  Each interval's
// annotations accumulate in an IntervalInfoBuilder; a builder is printed (via
// done) once the traversal moves past its interval.
class IntervalAnnotationWalker extends RefWalker[AnnotationMetaData,List[IntervalInfoBuilder]] {
@Argument(doc="Min proportion of bases overlapping between an interval of interest and an annotation interval for annotation to occur",shortName="mpb")
var minPropOverlap : Double = 0.50 // default to 50%
// Destination stream for the per-interval annotation lines.
@Output var out : PrintStream = _
// Traversal starts with no in-progress interval builders.
override def reduceInit : List[IntervalInfoBuilder] = {
Nil
}
// Gather, at the current locus: intervals of interest, refseq transcripts,
// and table rows, bundled into one AnnotationMetaData for reduce().
override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext) : AnnotationMetaData = {
val ivalsOfInterest : List[GATKFeature] = tracker.getGATKFeatureMetaData("interval_list",false).toList
val refSeqData : List[Object] = tracker.getReferenceMetaData("refseq").toList
val tableMetaData : List[Object] = tracker.getReferenceMetaData("table",false).toList
var amd : AnnotationMetaData = new AnnotationMetaData(ref.getLocus,ref.getBase)
if ( refSeqData != null ) {
amd.refSeqFeatures = refSeqData.map( (o: Object) => o.asInstanceOf[RefSeqFeature])
}
if ( tableMetaData != null ) {
// Table rows become (location, comma-joined values) pairs.
amd.tableFeatures = tableMetaData.map( (o: Object) => o.asInstanceOf[TableFeature]).map( (u: TableFeature) =>
new Tuple2(u.getLocation,u.getAllValues.toList.reduceLeft(_ + "," + _)))
}
amd.newIvals = ivalsOfInterest.map(_.getLocation)
return amd
}
// Flush builders whose interval lies before the current locus, drop finalized
// ones, start builders for intervals beginning exactly at this locus, then
// feed the current locus data to every live builder.
override def reduce(mapMetaData : AnnotationMetaData, rList : List[IntervalInfoBuilder]) : List[IntervalInfoBuilder] = {
rList.filter( (u : IntervalInfoBuilder) => u.location.isBefore(mapMetaData.locus)).foreach(u => out.print("%s%n".format(u.done)))
val newList : List[IntervalInfoBuilder] = rList.filter( ! _.finalized ) :::
mapMetaData.newIvals.filter( (g: GenomeLoc) => g.getStart == mapMetaData.locus.getStart ).map( u => new IntervalInfoBuilder(u,minPropOverlap) )
newList.foreach(_.update(mapMetaData))
return newList
}
// Flush any builders still alive when the traversal ends.
override def onTraversalDone(finalList : List[IntervalInfoBuilder]) = {
finalList.foreach(u => out.print("%s%n".format(u.done)))
}
}
/**
 * Per-locus payload passed from the walker's map() to reduce(): the locus, its
 * reference base, and the interval/RefSeq/table features found there.
 */
class AnnotationMetaData(loc: GenomeLoc, ref: Byte) {
  // The locus this metadata describes.
  val locus : GenomeLoc = loc
  // Reference base at the locus.
  val refBase : Byte = ref
  // Intervals of interest whose records were seen at this locus.
  var newIvals : List[GenomeLoc] = List.empty
  // RefSeq transcript features overlapping the locus.
  var refSeqFeatures : List[RefSeqFeature] = List.empty
  // (location, comma-joined values) pairs from the generic table track.
  var tableFeatures : List[(GenomeLoc,String)] = List.empty
}
/**
 * Accumulates per-base content and overlapping annotations for one interval of
 * interest, and renders a tab-separated summary line (contig, start, stop,
 * GC fraction, entropy score, annotations) via done().
 */
class IntervalInfoBuilder(loc : GenomeLoc, minProp : Double) {
  // Minimum fraction of this interval an annotation feature must cover
  // before it is recorded.
  val OVERLAP_PROP : Double = minProp
  // Annotation strings gathered for this interval (transcript and table hits).
  var metaData : HashSet[String] = new HashSet[String]
  // Transcript names already recorded, so each transcript is annotated once.
  var seenTranscripts : HashSet[String] = new HashSet[String]
  // Reference bases observed across the interval, in locus order.
  var baseContent : ListBuffer[Byte] = new ListBuffer[Byte]
  var location : GenomeLoc = loc
  // Computed lazily by done(); default-initialized until then.
  var gcContent : Double = _
  var entropy : Double = _
  // Set once done() has rendered this builder; the walker drops finalized builders.
  var finalized : Boolean = false

  /** Fold one locus worth of walker metadata into this builder. */
  def update(mmd : AnnotationMetaData) = {
    baseContent += mmd.refBase
    // Record each overlapping transcript at most once.
    mmd.refSeqFeatures.filter( p => ! seenTranscripts.contains(p.getTranscriptUniqueGeneName)).view.foreach(u => addRefSeq(u))
    // Record table features that are new and overlap enough of the interval.
    mmd.tableFeatures.filter( (t : (GenomeLoc,String)) =>
      ! metaData.contains(t._2) && (t._1.intersect(location).size.toDouble/location.size() >= OVERLAP_PROP) ).foreach( t => metaData.add(t._2))
  }

  /**
   * Annotate this interval with a RefSeq transcript: either the exon indices
   * that sufficiently overlap it, or UTR/Intron when no exon qualifies.
   */
  def addRefSeq( rs: RefSeqFeature ) : Unit = {
    seenTranscripts.add(rs.getTranscriptUniqueGeneName)
    val exons : List[(GenomeLoc,Int)] = rs.getExons.toList.zipWithIndex.filter( p => ((p._1.intersect(location).size+0.0)/location.size) >= minProp )
    if ( exons.nonEmpty ) {
      metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,exons.map( p => "exon%d".format(p._2) ).mkString(",")))
    } else {
      // Entirely before the first or past the last exon -> UTR; otherwise intronic.
      if ( location.isBefore(rs.getExons.get(0)) || location.isPast(rs.getExons.get(rs.getNumExons-1)) ) {
        metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,"UTR"))
      } else {
        metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,"Intron"))
      }
    }
  }

  /** Finalize the builder and render the summary line for this interval. */
  def done : String = {
    finalized = true
    def isGC(b : Byte) : Int = if ( BaseUtils.gIndex == b || BaseUtils.cIndex == b ) { 1 } else { 0 }
    gcContent = baseContent.foldLeft[Int](0)( (a,b) => a + isGC(b)).toDouble/location.size()
    entropy = calcEntropy(baseContent.map(b => ListBuffer(b))) + calcEntropy(baseContent.reverse.map(b => ListBuffer(b)))
    // BUG FIX: mkString is safe when the interval accumulated no annotations;
    // reduceLeft on an empty set throws UnsupportedOperationException.
    val meta : String = metaData.mkString("\t")
    return "%s\t%d\t%d\t%.2f\t%.2f\t%s".format(location.getContig,location.getStart,location.getStop,gcContent,entropy,meta)
  }

  // Recursive run-merging score over the base sequence: adds log(1 + length of
  // the leading run matching the second element) while merging equal runs.
  // NOTE(review): this is not textbook Shannon entropy -- preserved as written.
  def calcEntropy(byteList : ListBuffer[ListBuffer[Byte]]) : Double = {
    // <= 1 also guards the empty-input case, which previously crashed on
    // byteList.tail / byteList(1).
    if(byteList.size <= 1) return 0
    Math.log(1+byteList.tail.size-byteList.tail.dropWhile( u => u.equals(byteList(1))).size) +
      calcEntropy(byteList.tail.foldLeft(ListBuffer(byteList(0)))( (a,b) => {
        if ( b.equals(byteList(1)) ) {
          a.dropRight(1) :+ (a.last ++ b)
        } else {
          a :+ b
        }
      }))
  }
}

View File

@ -1,24 +0,0 @@
package org.broadinstitute.sting.scala
import org.broadinstitute.sting.gatk.walkers.LocusWalker
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker
import org.broadinstitute.sting.gatk.contexts.ReferenceContext
import org.broadinstitute.sting.gatk.contexts.AlignmentContext
/**
 * Minimal example locus walker: counts the number of loci traversed by
 * emitting 1 per locus and summing in reduce().
 */
class ScalaCountLoci extends LocusWalker[Int,Int] {
  /** Each visited locus contributes a count of one. */
  override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext): Int = 1

  /** The running total starts at zero. */
  def reduceInit(): Int = 0

  /** Accumulate the per-locus count into the running total. */
  def reduce(m: Int, r: Int): Int = m + r

  def main(args: Array[String]) {
    println("Hello, world!")
  }
}

View File

@ -1,71 +0,0 @@
package org.broadinstitute.sting.queue.pipeline
import org.testng.annotations.Test
import org.broadinstitute.sting.BaseTest
/**
 * Pipeline tests for the QTools one-off QScripts: VCF site/sample/interval
 * extraction, simple merging, and reference-order sorting. Each test builds a
 * PipelineTestSpec (script arguments plus expected output MD5s) and runs it
 * through PipelineTest.executeTest.
 */
class IPFLibraryPipelineTest {
  @Test
  def testVCFExtractSites {
    val testOut = "vcfes.vcf"
    val spec = new PipelineTestSpec
    spec.name = "vcfExtractSites"
    spec.args = "-S scala/qscript/oneoffs/QTools.q -T VCFExtractSites -ivcf %s -out %s".format(
      BaseTest.validationDataLocation + "omni_1212.subset.b37.vcf", testOut
    )
    spec.fileMD5s += testOut -> "4f496b8cf90302428a9edda486a337f4"
    PipelineTest.executeTest(spec)
  }

  @Test
  def testVCFExtractSamples {
    val testOut = "vcf.extract.samples.vcf"
    val spec = new PipelineTestSpec
    spec.name = "vcfExtractSamples"
    spec.args = "-S scala/qscript/oneoffs/QTools.q -T VCFExtractSamples -ivcf %s -out %s -sm HG00107,HG00500,NA18501,NA18942".format(
      BaseTest.validationDataLocation + "omni_1212.subset.b37.vcf", testOut
    )
    spec.fileMD5s += testOut -> "180d5a2e7a1fbc5117de6705bde3a7c8"
    // BUG FIX: the spec was built but never executed, so this test passed vacuously.
    PipelineTest.executeTest(spec)
  }

  @Test
  def testVCFExtractIntervals {
    val testOut = "vcf.extract.intervals.list"
    val spec = new PipelineTestSpec
    spec.name = "vcfExtractIntervals"
    spec.args = "-S scala/qscript/oneoffs/QTools.q -T VCFExtractIntervals -ivcf %s -out %s".format(
      BaseTest.validationDataLocation + "omni_1212.subset.b37.vcf", testOut
    )
    spec.fileMD5s += testOut ->"28589eeb28fac753577c027abf939345"
    // BUG FIX: the spec was built but never executed, so this test passed vacuously.
    PipelineTest.executeTest(spec)
  }

  @Test
  def testVCFSimpleMerge {
    val testOut = "vcf.simplemerge.vcf"
    val spec = new PipelineTestSpec
    val int1 = BaseTest.validationDataLocation + "omni.subset.interleaved.1.vcf"
    val int2 = BaseTest.validationDataLocation + "omni.subset.interleaved.2.vcf"
    spec.name = "vcfSimpleMerge"
    spec.args = "-S scala/qscript/oneoffs/QTools.q -T VCFSimpleMerge -vcfs %s,%s -out %s -ref %s".format(
      int1,int2,testOut,BaseTest.b37KGReference
    )
    spec.fileMD5s += testOut -> "c59b8de64ba787a2ca3a1734bdebf277"
    // BUG FIX: the spec was built but never executed, so this test passed vacuously.
    PipelineTest.executeTest(spec)
  }

  @Test
  def testSortByRef {
    val testOut = "vcf.refsorted.vcf"
    val spec = new PipelineTestSpec
    val unsorted = BaseTest.validationDataLocation + "omni.pos_sorted.vcf"
    spec.name = "sortByRef"
    spec.args = "-S scala/qscript/oneoffs/QTools.q -T SortByRef -ivcf %s -out %s -ref %s".format(
      unsorted, testOut, BaseTest.b37KGReference
    )
    spec.fileMD5s += testOut -> "ee09af803bc94987d55d044c2ebbc0b8"
    // BUG FIX: the spec was built but never executed, so this test passed vacuously.
    PipelineTest.executeTest(spec)
  }
}

View File

@ -32,8 +32,6 @@ import java.util.Date
import java.text.SimpleDateFormat
import org.broadinstitute.sting.BaseTest
import org.broadinstitute.sting.queue.QCommandLine
import org.broadinstitute.sting.datasources.pipeline.{Pipeline, PipelineProject, PipelineSample}
import org.broadinstitute.sting.utils.broad.PicardAggregationUtils
import org.broadinstitute.sting.queue.util.{Logging, ProcessController}
import java.io.{FileNotFoundException, File}
import org.broadinstitute.sting.gatk.report.GATKReportParser
@ -42,23 +40,6 @@ import org.broadinstitute.sting.queue.engine.CommandLinePluginManager
object PipelineTest extends BaseTest with Logging {
case class K1gBam(squidId: String, sampleId: String, version: Int)
/** 1000G BAMs used for validation */
val k1gBams = List(
new K1gBam("C474", "NA19651", 2),
new K1gBam("C474", "NA19655", 2),
new K1gBam("C474", "NA19669", 2),
new K1gBam("C454", "NA19834", 2),
new K1gBam("C460", "HG01440", 2),
new K1gBam("C456", "NA12342", 2),
new K1gBam("C456", "NA12748", 2),
new K1gBam("C474", "NA19649", 2),
new K1gBam("C474", "NA19652", 2),
new K1gBam("C474", "NA19654", 2))
validateK1gBams()
private val validationReportsDataLocation = "/humgen/gsa-hpprojects/GATK/validationreports/submitted/"
val run = System.getProperty("pipeline.run") == "run"
@ -92,49 +73,6 @@ object PipelineTest extends BaseTest with Logging {
*/
private def tempDir(testName: String, jobRunner: String) = testDir(testName, jobRunner) + "temp/"
/**
* Creates a new pipeline from a project.
* @param project Pipeline project info.
* @param samples List of samples.
* @return a new pipeline project.
*/
def createPipeline(project: PipelineProject, samples: List[PipelineSample]) = {
val pipeline = new Pipeline
pipeline.setProject(project)
pipeline.setSamples(samples)
pipeline
}
/**
* Creates a new pipeline project for hg19 with b37 132 dbsnp for genotyping, and b37 129 dbsnp for eval.
* @param projectName Name of the project.
* @param intervals The intervals file to use.
* @return a new pipeline project.
*/
def createHg19Project(projectName: String, intervals: String) = {
val project = new PipelineProject
project.setName(projectName)
project.setReferenceFile(new File(BaseTest.hg19Reference))
project.setGenotypeDbsnp(new File(BaseTest.b37dbSNP132))
project.setEvalDbsnp(new File(BaseTest.b37dbSNP129))
project.setRefseqTable(new File(BaseTest.hg19Refseq))
project.setIntervalList(new File(intervals))
project
}
/**
* Creates a 1000G pipeline sample from one of the bams.
* @param idPrefix Text to prepend to the sample name.
* @param k1gBam bam to create the sample for.
* @return the created pipeline sample.
*/
def createK1gSample(idPrefix: String, k1gBam: K1gBam) = {
val sample = new PipelineSample
sample.setId(idPrefix + "_" + k1gBam.sampleId)
sample.setBamFiles(Map("cleaned" -> getPicardBam(k1gBam)))
sample
}
/**
* Runs the pipelineTest.
* @param pipelineTest test to run.
@ -267,31 +205,6 @@ object PipelineTest extends BaseTest with Logging {
}
}
/**
* Throws an exception if any of the 1000G bams do not exist and warns if they are out of date.
*/
private def validateK1gBams() {
var missingBams = List.empty[File]
for (k1gBam <- k1gBams) {
val latest = getLatestVersion(k1gBam)
val bam = getPicardBam(k1gBam)
if (k1gBam.version != latest)
logger.warn("1000G bam is not the latest version %d: %s".format(latest, k1gBam))
if (!bam.exists)
missingBams :+= bam
}
if (missingBams.size > 0) {
val nl = "%n".format()
throw new FileNotFoundException("The following 1000G bam files are missing.%n%s".format(missingBams.mkString(nl)))
}
}
private def getPicardBam(k1gBam: K1gBam): File =
new File(PicardAggregationUtils.getSampleBam(k1gBam.squidId, k1gBam.sampleId, k1gBam.version))
private def getLatestVersion(k1gBam: K1gBam): Int =
PicardAggregationUtils.getLatestVersion(k1gBam.squidId, k1gBam.sampleId, k1gBam.version)
private var runningCommandLines = Set.empty[QCommandLine]
Runtime.getRuntime.addShutdownHook(new Thread {

View File

@ -35,7 +35,7 @@ class ExampleCountLociPipelineTest {
val spec = new PipelineTestSpec
spec.name = "countloci"
spec.args = Array(
" -S scala/qscript/examples/ExampleCountLoci.scala",
" -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala",
" -R " + BaseTest.hg18Reference,
" -I " + BaseTest.validationDataLocation + "small_bam_for_countloci.bam",
" -o " + testOut).mkString

View File

@ -32,7 +32,7 @@ class HelloWorldPipelineTest {
def testHelloWorld {
val spec = new PipelineTestSpec
spec.name = "HelloWorld"
spec.args = "-S scala/qscript/examples/HelloWorld.scala"
spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala"
PipelineTest.executeTest(spec)
}
@ -40,7 +40,7 @@ class HelloWorldPipelineTest {
def testHelloWorldWithPrefix {
val spec = new PipelineTestSpec
spec.name = "HelloWorldWithPrefix"
spec.args = "-S scala/qscript/examples/HelloWorld.scala -jobPrefix HelloWorld"
spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -jobPrefix HelloWorld"
PipelineTest.executeTest(spec)
}
@ -48,7 +48,7 @@ class HelloWorldPipelineTest {
def testHelloWorldWithPriority {
val spec = new PipelineTestSpec
spec.name = "HelloWorldWithPriority"
spec.args = "-S scala/qscript/examples/HelloWorld.scala -jobPriority 100"
spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -jobPriority 100"
PipelineTest.executeTest(spec)
}
}