diff --git a/build.xml b/build.xml
index d3e25d424..9a66d4699 100644
--- a/build.xml
+++ b/build.xml
@@ -955,8 +955,8 @@
-
-
+
+
diff --git a/LICENSE b/licensing/GATK1_LICENSE
similarity index 96%
rename from LICENSE
rename to licensing/GATK1_LICENSE
index 634096e2b..648ec8fc3 100644
--- a/LICENSE
+++ b/licensing/GATK1_LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2011 The Broad Institute
+Copyright (c) 2012 The Broad Institute
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
diff --git a/licensing/GATK2_beta_license.doc b/licensing/GATK2_beta_license.doc
new file mode 100644
index 000000000..4fa04a3f6
Binary files /dev/null and b/licensing/GATK2_beta_license.doc differ
diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
index 876cf5cbc..4c228ccb4 100644
--- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
+++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
@@ -2,19 +2,19 @@
.gsa.assignGATKTableToEnvironment <- function(tableName, tableHeader, tableRows, tableEnv) {
d = data.frame(tableRows, row.names=NULL, stringsAsFactors=FALSE);
colnames(d) = tableHeader;
-
+
for (i in 1:ncol(d)) {
# use the general type.convert infrastructure of read.table to convert column data to R types
v = type.convert(d[,i])
d[,i] = v;
}
-
+
usedNames = ls(envir=tableEnv, pattern=tableName);
-
+
if (length(usedNames) > 0) {
tableName = paste(tableName, ".", length(usedNames), sep="");
}
-
+
assign(tableName, d, envir=tableEnv);
}
@@ -28,74 +28,163 @@
starts = c(1, columnStarts);
stops = c(columnStarts - 1, nchar(line));
-
+
sapply(line, splitStartStop)[,1];
}
+# Old implementaton for v0.*
+gsa.read.gatkreportv0 <- function(lines) {
+
+ tableEnv = new.env();
+
+ tableName = NA;
+ tableHeader = c();
+ tableRows = c();
+ version = NA;
+
+ for (line in lines) {
+ if (length(grep("^##:GATKReport.v", line, ignore.case=TRUE)) > 0) {
+ headerFields = unlist(strsplit(line, "[[:space:]]+"));
+
+ if (!is.na(tableName)) {
+ .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv);
+ }
+
+ tableName = headerFields[2];
+ tableHeader = c();
+ tableRows = c();
+
+ # For differences in versions see
+ # $STING_HOME/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java
+ if (length(grep("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case=TRUE)) > 0) {
+ version = "v0.1";
+
+ } else if (length(grep("^##:GATKReport.v0.2[[:space:]]+", line, ignore.case=TRUE)) > 0) {
+ version = "v0.2";
+ columnStarts = c();
+
+ }
+
+ } else if (length(grep("^[[:space:]]*$", line)) > 0 | length(grep("^[[:space:]]*#", line)) > 0) {
+ # do nothing
+ } else if (!is.na(tableName)) {
+
+ if (version == "v0.1") {
+ row = unlist(strsplit(line, "[[:space:]]+"));
+
+ } else if (version == "v0.2") {
+ if (length(tableHeader) == 0) {
+ headerChars = unlist(strsplit(line, ""));
+ # Find the first position of non space characters, excluding the first character
+ columnStarts = intersect(grep("[[:space:]]", headerChars, invert=TRUE), grep("[[:space:]]", headerChars) + 1);
+ }
+
+ row = .gsa.splitFixedWidth(line, columnStarts);
+ }
+
+ if (length(tableHeader) == 0) {
+ tableHeader = row;
+ } else {
+ tableRows = rbind(tableRows, row);
+ }
+ }
+ }
+
+ if (!is.na(tableName)) {
+ .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv);
+ }
+
+ gatkreport = as.list(tableEnv, all.names=TRUE);
+}
+
+# Load all GATKReport v1 tables from file
+gsa.read.gatkreportv1 <- function(lines) {
+ #print("loading with optimized v1 reader")
+ nLines = length(lines)
+ tableEnv = new.env();
+
+ tableName = NA;
+ tableHeader = c();
+ tableRows = NULL;
+ version = "";
+ rowCount = 0
+ headerRowCount = -1;
+
+ finishTable <- function() {
+ .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows[1:rowCount,], tableEnv);
+ }
+
+ for (line in lines) {
+
+ if (length(grep("^#:GATKReport.v1", line, ignore.case=TRUE)) > 0) {
+ version = "v1.0";
+ headerRowCount = 0;
+ }
+
+ if ( (headerRowCount %% 2 == 1) && (version == "v1.0") ) {
+ #print("Trying to start a table with line:");
+ #print(line);
+
+ #Get table header
+ headerFields = unlist(strsplit(line, ":"));
+
+ if (!is.na(tableName)) {
+ finishTable()
+ }
+
+ tableName = headerFields[3];
+ tableHeader = c();
+ tableRows = NULL
+ rowCount = 0
+
+ columnStarts = c();
+ }
+
+ if (length(grep("^#:GATKTable", line, ignore.case=TRUE)) > 0) {
+ headerRowCount = headerRowCount+1;
+ #print("Header Row count is at:")
+ #print(headerRowCount);
+ } else if (!is.na(tableName)) {
+ if ( version == "v1.0") {
+ if (length(tableHeader) == 0) {
+ headerChars = unlist(strsplit(line, ""));
+ # Find the first position of non space characters, excluding the first character
+ columnStarts = intersect(grep("[[:space:]]", headerChars, invert=TRUE), grep("[[:space:]]", headerChars) + 1);
+ tableRows = matrix(nrow=nLines, ncol=length(columnStarts)+1);
+ }
+
+ row = .gsa.splitFixedWidth(line, columnStarts);
+ }
+
+ if (length(tableHeader) == 0) {
+ tableHeader = row;
+ } else if ( nchar(line) > 0 ) {
+ rowCount = rowCount + 1
+ tableRows[rowCount,] <- row
+ }
+ }
+ }
+
+ if (!is.na(tableName)) {
+ finishTable()
+ }
+
+ gatkreport = as.list(tableEnv, all.names=TRUE);
+}
+
# Load all GATKReport tables from a file
gsa.read.gatkreport <- function(filename) {
con = file(filename, "r", blocking = TRUE);
lines = readLines(con);
close(con);
-
- tableEnv = new.env();
-
- tableName = NA;
- tableHeader = c();
- tableRows = c();
- version = NA;
-
- for (line in lines) {
- if (length(grep("^##:GATKReport.v", line, ignore.case=TRUE)) > 0) {
- headerFields = unlist(strsplit(line, "[[:space:]]+"));
-
- if (!is.na(tableName)) {
- .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv);
- }
-
- tableName = headerFields[2];
- tableHeader = c();
- tableRows = c();
-
- # For differences in versions see
- # $STING_HOME/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java
- if (length(grep("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case=TRUE)) > 0) {
- version = "v0.1";
-
- } else if (length(grep("^##:GATKReport.v0.2[[:space:]]+", line, ignore.case=TRUE)) > 0) {
- version = "v0.2";
- columnStarts = c();
-
- }
-
- } else if (length(grep("^[[:space:]]*$", line)) > 0 | length(grep("^[[:space:]]*#", line)) > 0) {
- # do nothing
- } else if (!is.na(tableName)) {
-
- if (version == "v0.1") {
- row = unlist(strsplit(line, "[[:space:]]+"));
-
- } else if (version == "v0.2") {
- if (length(tableHeader) == 0) {
- headerChars = unlist(strsplit(line, ""));
- # Find the first position of non space characters, excluding the first character
- columnStarts = intersect(grep("[[:space:]]", headerChars, invert=TRUE), grep("[[:space:]]", headerChars) + 1);
- }
-
- row = .gsa.splitFixedWidth(line, columnStarts);
- }
-
- if (length(tableHeader) == 0) {
- tableHeader = row;
- } else {
- tableRows = rbind(tableRows, row);
- }
- }
+
+ # get first line
+ line = lines[1];
+
+ if (length(grep("^#:GATKReport.v1", line, ignore.case=TRUE)) > 0) {
+ gsa.read.gatkreportv1(lines)
}
-
- if (!is.na(tableName)) {
- .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv);
+ else if (length(grep("^##:GATKReport.v0", line, ignore.case=TRUE)) > 0) {
+ gsa.read.gatkreportv0(lines)
}
-
- gatkreport = as.list(tableEnv, all.names=TRUE);
}
diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R
new file mode 100644
index 000000000..19567e7e6
--- /dev/null
+++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R
@@ -0,0 +1,244 @@
+library(gplots)
+library(ggplot2)
+
+# -------------------------------------------------------
+# Utilities for displaying multiple plots per page
+# -------------------------------------------------------
+
+distributeGraphRows <- function(graphs, heights = c()) {
+ # Viewport layout 2 graphs top to bottom with given relative heights
+ #
+ #
+ if (length(heights) == 0) {
+ heights <- rep.int(1, length(graphs))
+ }
+ heights <- heights[!is.na(graphs)]
+ graphs <- graphs[!is.na(graphs)]
+ numGraphs <- length(graphs)
+ Layout <- grid.layout(nrow = numGraphs, ncol = 1, heights=heights)
+ grid.newpage()
+ pushViewport(viewport(layout = Layout))
+ subplot <- function(x) viewport(layout.pos.row = x, layout.pos.col = 1)
+ for (i in 1:numGraphs) {
+ print(graphs[[i]], vp = subplot(i))
+ }
+}
+
+distributeLogGraph <- function(graph, xName) {
+ continuousGraph <- graph + scale_x_continuous(xName)
+ logGraph <- graph + scale_x_log10(xName) + opts(title="")
+ distributeGraphRows(list(continuousGraph, logGraph))
+}
+
+distributePerSampleGraph <- function(perSampleGraph, distGraph, ratio=c(2,1)) {
+ distributeGraphRows(list(perSampleGraph, distGraph), ratio)
+}
+
+removeExtraStrats <- function(variantEvalDataFrame, moreToRemove=c()) {
+ # Remove the standard extra stratification columns FunctionalClass, Novelty, and others in moreToRemove from the variantEvalDataFrame
+ #
+ # Only keeps the column marked with "all" for each removed column
+ #
+ for ( toRemove in c("FunctionalClass", "Novelty", moreToRemove) ) {
+ if (toRemove %in% colnames(variantEvalDataFrame)) {
+ variantEvalDataFrame <- variantEvalDataFrame[variantEvalDataFrame[[toRemove]] == "all",]
+ }
+ }
+ variantEvalDataFrame
+}
+
+openPDF <- function(outputPDF) {
+ # Open the outputPDF file with standard dimensions, if outputPDF is not NA
+ if ( ! is.na(outputPDF) ) {
+ pdf(outputPDF, height=8.5, width=11)
+ }
+}
+
+closePDF <- function(outputPDF) {
+ # close the outputPDF file if not NA, and try to compact the PDF if possible
+ if ( ! is.na(outputPDF) ) {
+ dev.off()
+ if (exists("compactPDF")) {
+ compactPDF(outputPDF)
+ }
+ }
+}
+
+makeRatioDataFrame <- function(ACs, num, denom, widths = NULL) {
+ if ( is.null(widths) ) widths <- rep(1, length(ACs))
+
+ value = NULL
+ titv <- data.frame(AC=ACs, width = widths, num=num, denom = denom, ratio = num / denom)
+}
+
+.reduceACs <- function(binWidthForAC, ACs) {
+ # computes data structures necessary to reduce the full range of ACs
+ #
+ # binWidthForAC returns the number of upcoming bins that should be merged into
+ # that AC bin. ACs is a vector of all AC values from 0 to 2N that should be
+ # merged together
+ #
+ # Returns a list containing the reduced ACs starts, their corresponding widths,
+ # and a map from original ACs to their new ones (1 -> 1, 2 -> 2, 3 -> 2, etc)
+ maxAC <- max(ACs)
+ newACs <- c()
+ widths <- c()
+ newACMap <- c()
+ ac <- 0
+ while ( ac < maxAC ) {
+ newACs <- c(newACs, ac)
+ width <- binWidthForAC(ac)
+ widths <- c(widths, width)
+ newACMap <- c(newACMap, rep(ac, width))
+ ac <- ac + width
+ }
+ list(ACs = newACs, widths=widths, newACMap = newACMap)
+}
+
+# geometricACs <- function(k, ACs) {
+# nBins <- round(k * log10(max(ACs)))
+#
+# binWidthForAC <- function(ac) {
+# max(ceiling(ac / nBins), 1)
+# }
+#
+# return(reduceACs(binWidthForAC, ACs))
+# }
+
+reduce.AC.on.LogLinear.intervals <- function(scaleFactor, ACs) {
+ # map the full range of AC values onto a log linear scale
+ #
+ # Reduce the full AC range onto one where the width of each new AC increases at a rate of
+ # 10^scaleFactor in size with growing AC values. This is primarily useful for accurately
+ # computing ratios or other quantities by AC that aren't well determined when the AC
+ # values are very large
+ #
+ # Returns a list containing the reduced ACs starts, their corresponding widths,
+ # and a map from original ACs to their new ones (1 -> 1, 2 -> 2, 3 -> 2, etc)
+ maxAC <- max(ACs)
+ afs <- ACs / maxAC
+ breaks <- 10^(seq(-4, -1, scaleFactor))
+ widths <- c()
+ lastBreak <- 1
+ for ( i in length(breaks):1 ) {
+ b <- breaks[i]
+ width <- sum(afs < lastBreak & afs >= b)
+ widths <- c(widths, width)
+ lastBreak <- b
+ }
+ widths <- rev(widths)
+
+ binWidthForAC <- function(ac) {
+ af <- ac / maxAC
+ value = 1
+ for ( i in length(breaks):1 )
+ if ( af >= breaks[i] ) {
+ value = widths[i]
+ break
+ }
+
+ return(value)
+ }
+
+ return(.reduceACs(binWidthForAC, ACs))
+}
+
+.remapACs <- function(remapper, k, df) {
+ newACs <- remapper(k, df$AC)
+
+ n = length(newACs$ACs)
+ num = rep(0, n)
+ denom = rep(0, n)
+ for ( i in 1:dim(df)[1] ) {
+ rowI = df$AC == i
+ row = df[rowI,]
+ newAC = newACs$newACMap[row$AC]
+ newRowI = newACs$ACs == newAC
+ num[newRowI] = num[newRowI] + df$num[rowI]
+ denom[newRowI] = denom[newRowI] + df$denom[rowI]
+ }
+
+ newdf <- makeRatioDataFrame(newACs$ACs, num, denom, newACs$widths )
+ newdf
+}
+
+compute.ratio.on.LogLinear.AC.intervals <- function(ACs, num, denom, scaleFactor = 0.1) {
+ df = makeRatioDataFrame(ACs, num, denom, 1)
+ return(.remapACs(reduce.AC.on.LogLinear.intervals, scaleFactor, df))
+}
+
+plotVariantQC <- function(metrics, measures, requestedStrat = "Sample",
+ fixHistogramX=F, anotherStrat = NULL, nObsField = "n_indels",
+ onSamePage=F, facetVariableOnXPerSample = F, facetVariableOnXForDist = T,
+ moreTitle="", note = NULL) {
+ metrics$strat = metrics[[requestedStrat]]
+
+ otherFacet = "."
+ id.vars = c("strat", "nobs")
+ metrics$nobs <- metrics[[nObsField]]
+
+ # keep track of the other strat and it's implied facet value
+ if (! is.null(anotherStrat)) {
+ id.vars = c(id.vars, anotherStrat)
+ otherFacet = anotherStrat
+ }
+
+ molten <- melt(metrics, id.vars=id.vars, measure.vars=c(measures))
+ perSampleGraph <- ggplot(data=molten, aes(x=strat, y=value, group=variable, color=variable, fill=variable))
+
+ # create the title
+ titleText=paste(paste(paste(measures, collapse=", "), "by", requestedStrat), moreTitle)
+ if ( !is.null(note) ) {
+ titleText=paste(titleText, note, sep="\n")
+ }
+ paste(titleText)
+ title <- opts(title=titleText)
+
+ determineFacet <- function(onX) {
+ if ( onX ) {
+ paste(otherFacet, "~ variable")
+ } else {
+ paste("variable ~", otherFacet)
+ }
+ }
+
+ sampleFacet = determineFacet(facetVariableOnXPerSample)
+ distFacet = determineFacet(facetVariableOnXForDist)
+
+ if ( requestedStrat == "Sample" ) {
+ perSampleGraph <- perSampleGraph + geom_text(aes(label=strat), size=1.5) + geom_blank() # don't display a scale
+ perSampleGraph <- perSampleGraph + scale_x_discrete("Sample (ordered by nSNPs)", formatter=function(x) "")
+ } else { # by AlleleCount
+ perSampleGraph <- perSampleGraph + geom_point(aes(size=log10(nobs))) #+ geom_smooth(aes(weight=log10(nobs)))
+ perSampleGraph <- perSampleGraph + scale_x_log10("AlleleCount")
+ }
+ perSampleGraph <- perSampleGraph + ylab("Variable value") + title
+ perSampleGraph <- perSampleGraph + facet_grid(sampleFacet, scales="free")
+
+ nValues = length(unique(molten$value))
+ if (nValues > 2) {
+ if ( requestedStrat == "Sample" ) {
+ distGraph <- ggplot(data=molten, aes(x=value, group=variable, fill=variable))
+ } else {
+ distGraph <- ggplot(data=molten, aes(x=value, group=variable, fill=variable, weight=nobs))
+ }
+ distGraph <- distGraph + geom_histogram(aes(y=..ndensity..))
+ distGraph <- distGraph + geom_density(alpha=0.5, aes(y=..scaled..))
+ distGraph <- distGraph + geom_rug(aes(y=NULL, color=variable, position="jitter"))
+ scale = "free"
+ if ( fixHistogramX ) scale = "fixed"
+ distGraph <- distGraph + facet_grid(distFacet, scales=scale)
+ distGraph <- distGraph + ylab("Relative frequency")
+ distGraph <- distGraph + xlab("Variable value (see facet for variable by color)")
+ distGraph <- distGraph + opts(axis.text.x=theme_text(angle=-45)) # , legend.position="none")
+ } else {
+ distGraph <- NA
+ }
+
+ if ( onSamePage ) {
+ suppressMessages(distributePerSampleGraph(perSampleGraph, distGraph))
+ } else {
+ suppressMessages(print(perSampleGraph))
+ suppressMessages(print(distGraph + title))
+ }
+}
diff --git a/public/R/titvFPEst.R b/public/R/titvFPEst.R
deleted file mode 100755
index 7af5e8bbb..000000000
--- a/public/R/titvFPEst.R
+++ /dev/null
@@ -1,138 +0,0 @@
-titvFPEst <- function(titvExpected, titvObserved) { max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), 0.001) }
-
-titvFPEstV <- function(titvExpected, titvs) {
- sapply(titvs, function(x) titvFPEst(titvExpected, x))
-}
-
-calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) {
- TP <- nknown + (1-titvFPEst(knownTiTv, novelTiTv)) * nnovel
- 2 * TP / 3 / callable
-}
-
-marginalTiTv <- function( nx, titvx, ny, titvy ) {
- tvx = nx / (titvx + 1)
- tix = nx - tvx
- tvy = ny / (titvy + 1)
- tiy = ny - tvy
- tiz = tix - tiy
- tvz = tvx - tvy
- return(tiz / tvz)
-}
-marginaldbSNPRate <- function( nx, dbx, ny, dby ) {
- knownx = nx * dbx / 100
- novelx = nx - knownx
- knowny = ny * dby / 100
- novely = ny - knowny
- knownz = knownx - knowny
- novelz = novelx - novely
- return(knownz / ( knownz + novelz ) * 100)
-}
-
-numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) {
- nCalls <- L * theta * calledFractionOfRegion * sum(1 / seq(1, 2 * nIndividuals))
- return(list(nCalls = nCalls, nKnown = dbSNPRate * nCalls, nNovel = (1-dbSNPRate) * nCalls))
-}
-
-normalize <- function(x) {
- x / sum(x)
-}
-
-normcumsum <- function(x) {
- cumsum(normalize(x))
-}
-
-cumhist <- function(d, ...) {
- plot(d[order(d)], type="b", col="orange", lwd=2, ...)
-}
-
-revcumsum <- function(x) {
- return(rev(cumsum(rev(x))))
-}
-
-phred <- function(x) {
- log10(max(x,10^(-9.9)))*-10
-}
-
-pOfB <- function(b, B, Q) {
- #print(paste(b, B, Q))
- p = 1 - 10^(-Q/10)
- if ( b == B )
- return(p)
- else
- return(1 - p)
-}
-
-pOfG <- function(bs, qs, G) {
- a1 = G[1]
- a2 = G[2]
-
- log10p = 0
- for ( i in 1:length(bs) ) {
- b = bs[i]
- q = qs[i]
- p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2
- log10p = log10p + log10(p1)
- }
-
- return(log10p)
-}
-
-pOfGs <- function(nAs, nBs, Q) {
- bs = c(rep("a", nAs), rep("t", nBs))
- qs = rep(Q, nAs + nBs)
- G1 = c("a", "a")
- G2 = c("a", "t")
- G3 = c("t", "t")
-
- log10p1 = pOfG(bs, qs, G1)
- log10p2 = pOfG(bs, qs, G2)
- log10p3 = pOfG(bs, qs, G3)
- Qsample = phred(1 - 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3))))
-
- return(list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample))
-}
-
-QsampleExpected <- function(depth, Q) {
- weightedAvg = 0
- for ( d in 1:(depth*3) ) {
- Qsample = 0
- pOfD = dpois(d, depth)
- for ( nBs in 0:d ) {
- pOfnB = dbinom(nBs, d, 0.5)
- nAs = d - nBs
- Qsample = pOfGs(nAs, nBs, Q)$Qsample
- #Qsample = 1
- weightedAvg = weightedAvg + Qsample * pOfD * pOfnB
- print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg)))
- }
- }
-
- return(weightedAvg)
-}
-
-plotQsamples <- function(depths, Qs, Qmax) {
- cols = rainbow(length(Qs))
- plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling")
-
- for ( i in 1:length(Qs) ) {
- Q = Qs[i]
- y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q)))
- points(depths, y, col=cols[i], type="b")
- }
-
- legend("topleft", paste("Q", Qs), fill=cols)
-}
-
-pCallHetGivenDepth <- function(depth, nallelesToCall) {
- depths = 0:(2*depth)
- pNoAllelesToCall = apply(as.matrix(depths),1,function(d) sum(dbinom(0:nallelesToCall,d,0.5)))
- dpois(depths,depth)*(1-pNoAllelesToCall)
-}
-
-pCallHets <- function(depth, nallelesToCall) {
- sum(pCallHetGivenDepth(depth,nallelesToCall))
-}
-
-pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) {
- 1-(1-pCallHets(depth,nallelesToCall))^nsamples
-}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java
index 0c6096e0c..d1d616c97 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java
@@ -83,12 +83,12 @@ public final class IntervalBinding {
// TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files
- FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec();
+ final FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec();
if ( codec instanceof ReferenceDependentFeatureCodec )
((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(toolkit.getGenomeLocParser());
try {
- FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource()));
- AsciiLineReader lineReader = new AsciiLineReader(fis);
+ final FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource()));
+ final AsciiLineReader lineReader = new AsciiLineReader(fis);
codec.readHeader(lineReader);
String line = lineReader.readLine();
while ( line != null ) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
index e5aaf2338..c6bb4a27c 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
@@ -103,21 +103,6 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
argumentSources.add(walker);
Collection rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser);
-
- // todo: remove me when the old style system is removed
- if ( getArgumentCollection().RODBindings.size() > 0 ) {
- logger.warn("################################################################################");
- logger.warn("################################################################################");
- logger.warn("Deprecated -B rod binding syntax detected. This syntax has been eliminated in GATK 1.2.");
- logger.warn("Please use arguments defined by each specific walker instead.");
- for ( String oldStyleRodBinding : getArgumentCollection().RODBindings ) {
- logger.warn(" -B rod binding with value " + oldStyleRodBinding + " tags: " + parser.getTags(oldStyleRodBinding).getPositionalTags());
- }
- logger.warn("################################################################################");
- logger.warn("################################################################################");
- System.exit(1);
- }
-
engine.setReferenceMetaDataFiles(rodBindings);
for (ReadFilter filter: filters) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
index 9c59ffe9a..70c6bc734 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
@@ -100,10 +100,11 @@ public class CommandLineGATK extends CommandLineExecutable {
} catch(PicardException e) {
// TODO: Should Picard exceptions be, in general, UserExceptions or ReviewedStingExceptions?
exitSystemWithError(e);
- }
- catch (SAMException e) {
+ } catch (SAMException e) {
checkForTooManyOpenFilesProblem(e.getMessage());
exitSystemWithSamError(e);
+ } catch (OutOfMemoryError e) {
+ exitSystemWithUserError(new UserException.NotEnoughMemory());
} catch (Throwable t) {
checkForTooManyOpenFilesProblem(t.getMessage());
exitSystemWithError(t);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
index 50ef4653b..039ca565a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@@ -26,7 +26,9 @@ package org.broadinstitute.sting.gatk;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFile;
-import net.sf.samtools.*;
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMRecord;
+import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
@@ -35,8 +37,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
-import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
-import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
@@ -45,6 +45,8 @@ import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.stubs.Stub;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
+import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
+import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
@@ -190,7 +192,7 @@ public class GenomeAnalysisEngine {
private BaseRecalibration baseRecalibration = null;
public BaseRecalibration getBaseRecalibration() { return baseRecalibration; }
public boolean hasBaseRecalibration() { return baseRecalibration != null; }
- public void setBaseRecalibration(File recalFile) { baseRecalibration = new BaseRecalibration(recalFile); }
+ public void setBaseRecalibration(File recalFile, int quantizationLevels) { baseRecalibration = new BaseRecalibration(recalFile, quantizationLevels); }
/**
* Actually run the GATK with the specified walker.
@@ -216,7 +218,7 @@ public class GenomeAnalysisEngine {
// if the use specified an input BQSR recalibration table then enable on the fly recalibration
if (this.getArguments().BQSR_RECAL_FILE != null)
- setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE);
+ setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE, this.getArguments().quantizationLevels);
// Determine how the threads should be divided between CPU vs. IO.
determineThreadAllocation();
@@ -356,10 +358,6 @@ public class GenomeAnalysisEngine {
public BAQ.QualityMode getWalkerBAQQualityMode() { return WalkerManager.getBAQQualityMode(walker); }
public BAQ.ApplicationTime getWalkerBAQApplicationTime() { return WalkerManager.getBAQApplicationTime(walker); }
- protected boolean generateExtendedEvents() {
- return walker.generateExtendedEvents();
- }
-
protected boolean includeReadsWithDeletionAtLoci() {
return walker.includeReadsWithDeletionAtLoci();
}
@@ -613,7 +611,7 @@ public class GenomeAnalysisEngine {
*/
protected GenomeLocSortedSet loadIntervals( List> argList, IntervalSetRule rule ) {
- List allIntervals = new ArrayList(0);
+ List allIntervals = new ArrayList();
for ( IntervalBinding intervalBinding : argList ) {
List intervals = intervalBinding.getIntervals(this);
@@ -766,7 +764,6 @@ public class GenomeAnalysisEngine {
new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
filters,
includeReadsWithDeletionAtLoci(),
- generateExtendedEvents(),
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
getWalkerBAQQualityMode(),
refReader,
diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java
index db22886ce..dc77df071 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java
@@ -36,7 +36,6 @@ public class ReadProperties {
private final Collection supplementalFilters;
private final boolean includeReadsWithDeletionAtLoci;
private final boolean useOriginalBaseQualities;
- private final boolean generateExtendedEvents;
private final BAQ.CalculationMode cmode;
private final BAQ.QualityMode qmode;
private final IndexedFastaSequenceFile refReader; // read for BAQ, if desired
@@ -52,16 +51,9 @@ public class ReadProperties {
return includeReadsWithDeletionAtLoci;
}
- /**
- * Return true if the walker wants to see additional piles of "extended" events (indels). An indel is associated,
- * by convention, with the reference base immediately preceding the insertion/deletion, and if this flag is set
- * to 'true', any locus with an indel associated with it will cause exactly two subsequent calls to walker's map(): first call
- * will be made with a "conventional" base pileup, the next call will be made with a pileup of extended (indel/noevent)
- * events.
- * @return
- */
+ @Deprecated
public boolean generateExtendedEvents() {
- return generateExtendedEvents;
+ return false;
}
/**
@@ -144,9 +136,6 @@ public class ReadProperties {
* @param downsamplingMethod Method for downsampling reads at a given locus.
* @param exclusionList what safety checks we're willing to let slide
* @param supplementalFilters additional filters to dynamically apply.
- * @param generateExtendedEvents if true, the engine will issue an extra call to walker's map() with
- * a pile of indel/noevent extended events at every locus with at least one indel associated with it
- * (in addition to a "regular" call to map() at this locus performed with base pileup)
* @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method
* will explicitly list reads with deletion over the current reference base; otherwise, only observed
* bases will be seen in the pileups, and the deletions will be skipped silently.
@@ -163,7 +152,6 @@ public class ReadProperties {
ValidationExclusion exclusionList,
Collection supplementalFilters,
boolean includeReadsWithDeletionAtLoci,
- boolean generateExtendedEvents,
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
@@ -176,7 +164,6 @@ public class ReadProperties {
this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList;
this.supplementalFilters = supplementalFilters;
this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci;
- this.generateExtendedEvents = generateExtendedEvents;
this.useOriginalBaseQualities = useOriginalBaseQualities;
this.cmode = cmode;
this.qmode = qmode;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
index 02d211a0c..3a1408d59 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -107,11 +107,6 @@ public class GATKArgumentCollection {
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false)
public File referenceFile = null;
- @Deprecated
- @Hidden
- @Input(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form :, ", required = false)
- public ArrayList RODBindings = new ArrayList();
-
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
public boolean nonDeterministicRandomSeed = false;
@@ -198,6 +193,16 @@ public class GATKArgumentCollection {
@Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration")
public File BQSR_RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously
+ /**
+ * Turns on the base quantization module. It requires a recalibration report (-BQSR).
+ *
+ * A value of 0 here means "do not quantize".
+ * Any value greater than zero will be used to recalculate the quantization using this many levels.
+ * Negative values do nothing (i.e. quantize using the recalibration report's quantization level -- same as not providing this parameter at all)
+ */
+ @Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels.", required=false)
+ public int quantizationLevels = -1;
+
@Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false)
public byte defaultBaseQualities = -1;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java
index 57416d111..9a847d38e 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java
@@ -98,6 +98,7 @@ public class AlignmentContext implements HasGenomeLocation {
* only base pileup.
* @return
*/
+ @Deprecated
public ReadBackedExtendedEventPileup getExtendedEventPileup() {
if(!hasExtendedEventPileup())
throw new ReviewedStingException("No extended event pileup is present.");
@@ -115,6 +116,7 @@ public class AlignmentContext implements HasGenomeLocation {
*
* @return
*/
+ @Deprecated
public boolean hasExtendedEventPileup() { return basePileup instanceof ReadBackedExtendedEventPileup; }
/**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java
index 376064cdb..1290319e2 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java
@@ -191,6 +191,16 @@ public class ReferenceContext {
return basesCache;
}
+ /**
+ * All the bases in the window from the current base forward to the end of the window.
+ */
+ public byte[] getForwardBases() {
+ final byte[] bases = getBases();
+ final int mid = locus.getStart() - window.getStart();
+ // todo -- warning of performance problem, especially if this is called over and over
+ return new String(bases).substring(mid).getBytes();
+ }
+
@Deprecated
public char getBaseAsChar() {
return (char)getBase();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java
index 4005f1c32..87b356fce 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalOverlapFilteringIterator.java
@@ -28,6 +28,7 @@ import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import java.util.List;
import java.util.NoSuchElementException;
@@ -154,8 +155,8 @@ class IntervalOverlapFilteringIterator implements CloseableIterator {
}
}
else {
- // Found an unmapped read. We're done.
- if(candidateRead.getReadUnmappedFlag()) {
+ // Found a -L UNMAPPED read. NOTE: this is different than just being flagged as unmapped! We're done.
+ if(AlignmentUtils.isReadGenomeLocUnmapped(candidateRead)) {
nextRead = candidateRead;
break;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
index bbcbe6dbc..7f8c35c96 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
@@ -167,7 +167,6 @@ public class SAMDataSource {
null,
new ValidationExclusion(),
new ArrayList(),
- false,
false);
}
@@ -185,8 +184,7 @@ public class SAMDataSource {
DownsamplingMethod downsamplingMethod,
ValidationExclusion exclusionList,
Collection supplementalFilters,
- boolean includeReadsWithDeletionAtLoci,
- boolean generateExtendedEvents) {
+ boolean includeReadsWithDeletionAtLoci) {
this( samFiles,
threadAllocation,
numFileHandles,
@@ -198,7 +196,6 @@ public class SAMDataSource {
exclusionList,
supplementalFilters,
includeReadsWithDeletionAtLoci,
- generateExtendedEvents,
BAQ.CalculationMode.OFF,
BAQ.QualityMode.DONT_MODIFY,
null, // no BAQ
@@ -215,9 +212,6 @@ public class SAMDataSource {
* @param downsamplingMethod Method for downsampling reads at a given locus.
* @param exclusionList what safety checks we're willing to let slide
* @param supplementalFilters additional filters to dynamically apply.
- * @param generateExtendedEvents if true, the engine will issue an extra call to walker's map() with
- * a pile of indel/noevent extended events at every locus with at least one indel associated with it
- * (in addition to a "regular" call to map() at this locus performed with base pileup)
* @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method
* will explicitly list reads with deletion over the current reference base; otherwise, only observed
* bases will be seen in the pileups, and the deletions will be skipped silently.
@@ -235,7 +229,6 @@ public class SAMDataSource {
ValidationExclusion exclusionList,
Collection supplementalFilters,
boolean includeReadsWithDeletionAtLoci,
- boolean generateExtendedEvents,
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
@@ -308,7 +301,6 @@ public class SAMDataSource {
exclusionList,
supplementalFilters,
includeReadsWithDeletionAtLoci,
- generateExtendedEvents,
cmode,
qmode,
refReader,
diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java
index 0987c5d74..6a9642d97 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/filters/BadCigarFilter.java
@@ -40,17 +40,26 @@ public class BadCigarFilter extends ReadFilter {
public boolean filterOut(final SAMRecord rec) {
Cigar c = rec.getCigar();
- boolean lastElementWasIndel = false;
- for ( CigarElement ce : c.getCigarElements() ) {
- if ( ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I ) {
- if ( lastElementWasIndel )
- return true;
- lastElementWasIndel = true;
- } else {
- lastElementWasIndel = false;
+ boolean previousElementWasIndel = false;
+ CigarOperator lastOp = c.getCigarElement(0).getOperator();
+
+ if (lastOp == CigarOperator.D) // filter out reads starting with deletion
+ return true;
+
+ for (CigarElement ce : c.getCigarElements()) {
+ CigarOperator op = ce.getOperator();
+ if (op == CigarOperator.D || op == CigarOperator.I) {
+ if (previousElementWasIndel)
+ return true; // filter out reads with adjacent I/D
+
+ previousElementWasIndel = true;
}
+ else // this is a regular base (match/mismatch/hard or soft clip)
+ previousElementWasIndel = false; // reset the previous element
+
+ lastOp = op;
}
- return false;
+ return lastOp == CigarOperator.D;
}
}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java
index 82cb43634..94051cc7f 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2009 The Broad Institute
+ * Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@@ -12,7 +12,6 @@
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
- *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -99,8 +98,13 @@ public class VCFWriterStub implements Stub, VCFWriter {
/**
* Create a new stub given the requested file.
+ *
+ * @param engine engine.
* @param genotypeFile file to (ultimately) create.
* @param isCompressed should we compress the output stream?
+ * @param argumentSources sources.
+ * @param skipWritingHeader skip writing header.
+ * @param doNotWriteGenotypes do not write genotypes.
*/
public VCFWriterStub(GenomeAnalysisEngine engine, File genotypeFile, boolean isCompressed, Collection