diff --git a/build.xml b/build.xml index fe4c7a3f4..438e9c90c 100644 --- a/build.xml +++ b/build.xml @@ -79,6 +79,17 @@ + + + + + + + + + + + @@ -457,6 +468,26 @@ + + + + + + + + + + + + + + + @@ -489,6 +520,8 @@ + + @@ -957,6 +990,12 @@ + + @@ -1050,7 +1089,7 @@ - + diff --git a/ivy.xml b/ivy.xml index 3f3d1c97f..a394aa6d7 100644 --- a/ivy.xml +++ b/ivy.xml @@ -30,6 +30,9 @@ + + + diff --git a/public/R/src/gsalib/DESCRIPTION b/public/R/src/gsalib/DESCRIPTION new file mode 100644 index 000000000..6116e8c66 --- /dev/null +++ b/public/R/src/gsalib/DESCRIPTION @@ -0,0 +1,10 @@ +Package: gsalib +Type: Package +Title: Utility functions +Version: 1.0 +Date: 2010-10-02 +Author: Kiran Garimella +Maintainer: Kiran Garimella +Description: Utility functions for GATK NGS analyses +License: BSD +LazyLoad: yes diff --git a/public/R/src/gsalib/R/gsa.error.R b/public/R/src/gsalib/R/gsa.error.R new file mode 100644 index 000000000..1c6a56046 --- /dev/null +++ b/public/R/src/gsalib/R/gsa.error.R @@ -0,0 +1,12 @@ +gsa.error <- function(message) { + message(""); + gsa.message("Error: **********"); + gsa.message(sprintf("Error: %s", message)); + gsa.message("Error: **********"); + message(""); + + traceback(); + + message(""); + stop(message, call. 
= FALSE); +} diff --git a/public/R/src/gsalib/R/gsa.getargs.R b/public/R/src/gsalib/R/gsa.getargs.R new file mode 100644 index 000000000..94613bf93 --- /dev/null +++ b/public/R/src/gsalib/R/gsa.getargs.R @@ -0,0 +1,116 @@ +.gsa.getargs.usage <- function(argspec, doc) { + cargs = commandArgs(); + + usage = "Usage:"; + + fileIndex = grep("--file=", cargs); + if (length(fileIndex) > 0) { + progname = gsub("--file=", "", cargs[fileIndex[1]]); + + usage = sprintf("Usage: Rscript %s [arguments]", progname); + + if (!is.na(doc)) { + message(sprintf("%s: %s\n", progname, doc)); + } + } + + message(usage); + + for (argname in names(argspec)) { + key = argname; + defaultValue = 0; + doc = ""; + + if (is.list(argspec[[argname]])) { + defaultValue = argspec[[argname]]$value; + doc = argspec[[argname]]$doc; + } + + message(sprintf(" -%-10s\t[default: %s]\t%s", key, defaultValue, doc)); + } + + message(""); + + stop(call. = FALSE); +} + +gsa.getargs <- function(argspec, doc = NA) { + argsenv = new.env(); + + for (argname in names(argspec)) { + value = 0; + if (is.list(argspec[[argname]])) { + value = argspec[[argname]]$value; + } else { + value = argspec[[argname]]; + } + + assign(argname, value, envir=argsenv); + } + + if (interactive()) { + for (argname in names(argspec)) { + value = get(argname, envir=argsenv); + + if (is.na(value) | is.null(value)) { + if (exists("cmdargs")) { + assign(argname, cmdargs[[argname]], envir=argsenv); + } else { + assign(argname, readline(sprintf("Please enter a value for '%s': ", argname)), envir=argsenv); + } + } else { + assign(argname, value, envir=argsenv); + } + } + } else { + cargs = commandArgs(TRUE); + + if (length(cargs) == 0) { + .gsa.getargs.usage(argspec, doc); + } + + for (i in 1:length(cargs)) { + if (length(grep("^-", cargs[i], ignore.case=TRUE)) > 0) { + key = gsub("-", "", cargs[i]); + value = cargs[i+1]; + + if (key == "h" | key == "help") { + .gsa.getargs.usage(argspec, doc); + } + + if (length(grep("^[\\d\\.e\\+\\-]+$", 
value, perl=TRUE, ignore.case=TRUE)) > 0) { + value = as.numeric(value); + } + + assign(key, value, envir=argsenv); + } + } + } + + args = as.list(argsenv); + + isMissingArgs = 0; + missingArgs = c(); + + for (arg in names(argspec)) { + if (is.na(args[[arg]]) | is.null(args[[arg]])) { + gsa.warn(sprintf("Value for required argument '-%s' was not specified", arg)); + + isMissingArgs = 1; + missingArgs = c(missingArgs, arg); + } + } + + if (isMissingArgs) { + gsa.error( + paste( + "Missing required arguments: -", + paste(missingArgs, collapse=" -"), + ". Specify -h or -help to this script for a list of available arguments.", + sep="" + ) + ); + } + + args; +} diff --git a/public/R/src/gsalib/R/gsa.message.R b/public/R/src/gsalib/R/gsa.message.R new file mode 100644 index 000000000..a2b909d3d --- /dev/null +++ b/public/R/src/gsalib/R/gsa.message.R @@ -0,0 +1,3 @@ +gsa.message <- function(message) { + message(sprintf("[gsalib] %s", message)); +} diff --git a/public/R/src/gsalib/R/gsa.plot.venn.R b/public/R/src/gsalib/R/gsa.plot.venn.R new file mode 100644 index 000000000..b1353ccc1 --- /dev/null +++ b/public/R/src/gsalib/R/gsa.plot.venn.R @@ -0,0 +1,50 @@ +gsa.plot.venn <- +function(a, b, c=0, a_and_b, a_and_c=0, b_and_c=0, + col=c("#FF6342", "#63C6DE", "#ADDE63"), + pos=c(0.20, 0.20, 0.80, 0.82), + debug=0 + ) { + library(png); + library(graphics); + + # Set up properties + for (i in 1:length(col)) { + rgbcol = col2rgb(col[i]); + col[i] = sprintf("%02X%02X%02X", rgbcol[1], rgbcol[2], rgbcol[3]); + } + + chco = paste(col[1], col[2], col[3], sep=","); + chd = paste(a, b, c, a_and_b, a_and_c, b_and_c, sep=","); + + props = c( + 'cht=v', + 'chs=525x525', + 'chds=0,10000000000', + paste('chco=', chco, sep=""), + paste('chd=t:', chd, sep="") + ); + proplist = paste(props[1], props[2], props[3], props[4], props[5], sep='&'); + + # Get the venn diagram (as a temporary file) + filename = tempfile("venn"); + cmd = paste("wget -O ", filename, " 
'http://chart.apis.google.com/chart?", proplist, "' > /dev/null 2>&1", sep=""); + + if (debug == 1) { + print(cmd); + } + system(cmd); + + # Render the temp png file into a plotting frame + a = readPNG(filename); + + plot(0, 0, type="n", xaxt="n", yaxt="n", bty="n", xlim=c(0, 1), ylim=c(0, 1), xlab="", ylab=""); + if (c == 0 || a >= b) { + rasterImage(a, pos[1], pos[2], pos[3], pos[4]); + } else { + rasterImage(a, 0.37+pos[1], 0.37+pos[2], 0.37+pos[3], 0.37+pos[4], angle=180); + } + + # Clean up! + unlink(filename); +} + diff --git a/public/R/src/gsalib/R/gsa.read.eval.R b/public/R/src/gsalib/R/gsa.read.eval.R new file mode 100644 index 000000000..f1d49092b --- /dev/null +++ b/public/R/src/gsalib/R/gsa.read.eval.R @@ -0,0 +1,83 @@ +.gsa.attemptToLoadFile <- function(filename) { + file = NA; + + if (file.exists(filename) & file.info(filename)$size > 500) { + file = read.csv(filename, header=TRUE, comment.char="#"); + } + + file; +} + +gsa.read.eval <- +function(evalRoot) { + fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep=""); + fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep=""); + fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep=""); + fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep=""); + fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep=""); + fileMetricsBySample = paste(evalRoot, ".MetricsBySample.csv", sep=""); + fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep=""); + fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep=""); + fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep=""); + fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep=""); + fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep=""); + fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep=""); + fileTiTvStats = paste(evalRoot, 
".TiTvStats.csv", sep=""); + fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep=""); + + eval = list( + AlleleCountStats = NA, + CompOverlap = NA, + CountVariants = NA, + GenotypeConcordance = NA, + MetricsByAc = NA, + MetricsBySample = NA, + Quality_Metrics_by_allele_count = NA, + QualityScoreHistogram = NA, + SampleStatistics = NA, + SampleSummaryStatistics = NA, + SimpleMetricsBySample = NA, + TiTv = NA, + TiTvStats = NA, + Variant_Quality_Score = NA, + + CallsetNames = c(), + CallsetOnlyNames = c(), + CallsetFilteredNames = c() + ); + + eval$AlleleCountStats = .gsa.attemptToLoadFile(fileAlleleCountStats); + eval$CompOverlap = .gsa.attemptToLoadFile(fileCompOverlap); + eval$CountVariants = .gsa.attemptToLoadFile(fileCountVariants); + eval$GenotypeConcordance = .gsa.attemptToLoadFile(fileGenotypeConcordance); + eval$MetricsByAc = .gsa.attemptToLoadFile(fileMetricsByAc); + eval$MetricsBySample = .gsa.attemptToLoadFile(fileMetricsBySample); + eval$Quality_Metrics_by_allele_count = .gsa.attemptToLoadFile(fileQuality_Metrics_by_allele_count); + eval$QualityScoreHistogram = .gsa.attemptToLoadFile(fileQualityScoreHistogram); + eval$SampleStatistics = .gsa.attemptToLoadFile(fileSampleStatistics); + eval$SampleSummaryStatistics = .gsa.attemptToLoadFile(fileSampleSummaryStatistics); + eval$SimpleMetricsBySample = .gsa.attemptToLoadFile(fileSimpleMetricsBySample); + eval$TiTv = .gsa.attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator); + eval$TiTvStats = .gsa.attemptToLoadFile(fileTiTvStats); + eval$Variant_Quality_Score = .gsa.attemptToLoadFile(fileVariant_Quality_Score); + + uniqueJexlExpressions = unique(eval$TiTv$jexl_expression); + eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]); + eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames)); + eval$CallsetFilteredNames = as.vector(c( + paste(gsub("^(\\w)", "In\\U\\1", 
eval$CallsetNames[1], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), sep=""), + paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), sep="")) + ); + + if (!(eval$CallsetFilteredNames[1] %in% unique(eval$TiTv$jexl_expression))) { + eval$CallsetFilteredNames[1] = paste("In", eval$CallsetNames[1], "-FilteredIn", eval$CallsetNames[2], sep=""); + } + + if (!(eval$CallsetFilteredNames[2] %in% unique(eval$TiTv$jexl_expression))) { + eval$CallsetFilteredNames[2] = paste("In", eval$CallsetNames[2], "-FilteredIn", eval$CallsetNames[1], sep=""); + #eval$CallsetFilteredNames[2] = paste(gsub("^(\\w)", "In", eval$CallsetNames[2], perl=TRUE), "-Filtered", gsub("^(\\w)", "In", eval$CallsetNames[1], perl=TRUE), sep=""); + } + + eval; +} + diff --git a/public/R/src/gsalib/R/gsa.read.gatkreport.R b/public/R/src/gsalib/R/gsa.read.gatkreport.R new file mode 100644 index 000000000..011b5240d --- /dev/null +++ b/public/R/src/gsalib/R/gsa.read.gatkreport.R @@ -0,0 +1,103 @@ +# Load a table into the specified environment. Make sure that each new table gets a unique name (this allows one to cat a bunch of tables with the same name together and load them into R without each table overwriting the last. +.gsa.assignGATKTableToEnvironment <- function(tableName, tableHeader, tableRows, tableEnv) { + d = data.frame(tableRows, row.names=NULL, stringsAsFactors=FALSE); + colnames(d) = tableHeader; + + for (i in 1:ncol(d)) { + v = suppressWarnings(as.numeric(d[,i])); + + if (length(na.omit(as.numeric(v))) == length(d[,i])) { + d[,i] = v; + } + } + + usedNames = ls(envir=tableEnv, pattern=tableName); + + if (length(usedNames) > 0) { + tableName = paste(tableName, ".", length(usedNames), sep=""); + } + + assign(tableName, d, envir=tableEnv); +} + +# Read a fixed width line of text into a list. 
+.gsa.splitFixedWidth <- function(line, columnStarts) { + splitStartStop <- function(x) { + x = substring(x, starts, stops); + x = gsub("^[[:space:]]+|[[:space:]]+$", "", x); + x; + } + + starts = c(1, columnStarts); + stops = c(columnStarts - 1, nchar(line)); + + sapply(line, splitStartStop)[,1]; +} + +# Load all GATKReport tables from a file +gsa.read.gatkreport <- function(filename) { + con = file(filename, "r", blocking = TRUE); + lines = readLines(con); + close(con); + + tableEnv = new.env(); + + tableName = NA; + tableHeader = c(); + tableRows = c(); + version = NA; + + for (line in lines) { + if (length(grep("^##:GATKReport.v", line, ignore.case=TRUE)) > 0) { + headerFields = unlist(strsplit(line, "[[:space:]]+")); + + if (!is.na(tableName)) { + .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv); + } + + tableName = headerFields[2]; + tableHeader = c(); + tableRows = c(); + + # For differences in versions see + # $STING_HOME/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java + if (length(grep("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case=TRUE)) > 0) { + version = "v0.1"; + + } else if (length(grep("^##:GATKReport.v0.2[[:space:]]+", line, ignore.case=TRUE)) > 0) { + version = "v0.2"; + columnStarts = c(); + + } + + } else if (length(grep("^[[:space:]]*$", line)) > 0 | length(grep("^[[:space:]]*#", line)) > 0) { + # do nothing + } else if (!is.na(tableName)) { + + if (version == "v0.1") { + row = unlist(strsplit(line, "[[:space:]]+")); + + } else if (version == "v0.2") { + if (length(tableHeader) == 0) { + headerChars = unlist(strsplit(line, "")); + # Find the first position of non space characters, excluding the first character + columnStarts = intersect(grep("[[:space:]]", headerChars, invert=TRUE), grep("[[:space:]]", headerChars) + 1); + } + + row = .gsa.splitFixedWidth(line, columnStarts); + } + + if (length(tableHeader) == 0) { + tableHeader = row; + } else { + tableRows = rbind(tableRows, row); 
+ } + } + } + + if (!is.na(tableName)) { + .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv); + } + + gatkreport = as.list(tableEnv); +} diff --git a/public/R/src/gsalib/R/gsa.read.squidmetrics.R b/public/R/src/gsalib/R/gsa.read.squidmetrics.R new file mode 100644 index 000000000..39fa1ad32 --- /dev/null +++ b/public/R/src/gsalib/R/gsa.read.squidmetrics.R @@ -0,0 +1,28 @@ +gsa.read.squidmetrics = function(project, bylane = FALSE) { + suppressMessages(library(ROracle)); + + drv = dbDriver("Oracle"); + con = dbConnect(drv, "REPORTING/REPORTING@ora01:1521/SEQPROD"); + + if (bylane) { + statement = paste("SELECT * FROM ILLUMINA_PICARD_METRICS WHERE \"Project\" = '", project, "'", sep=""); + print(statement); + + rs = dbSendQuery(con, statement = statement); + d = fetch(rs, n=-1); + dbHasCompleted(rs); + dbClearResult(rs); + } else { + statement = paste("SELECT * FROM ILLUMINA_SAMPLE_STATUS_AGG WHERE \"Project\" = '", project, "'", sep=""); + print(statement); + + rs = dbSendQuery(con, statement = statement); + d = fetch(rs, n=-1); + dbHasCompleted(rs); + dbClearResult(rs); + } + + oraCloseDriver(drv); + + subset(d, Project == project); +} diff --git a/public/R/src/gsalib/R/gsa.read.vcf.R b/public/R/src/gsalib/R/gsa.read.vcf.R new file mode 100644 index 000000000..5beb6455d --- /dev/null +++ b/public/R/src/gsalib/R/gsa.read.vcf.R @@ -0,0 +1,23 @@ +gsa.read.vcf <- function(vcffile, skip=0, nrows=-1, expandGenotypeFields = FALSE) { + headers = readLines(vcffile, n=100); + headerline = headers[grep("#CHROM", headers)]; + header = unlist(strsplit(gsub("#", "", headerline), "\t")) + + d = read.table(vcffile, header=FALSE, skip=skip, nrows=nrows, stringsAsFactors=FALSE); + colnames(d) = header; + + if (expandGenotypeFields) { + columns = ncol(d); + + offset = columns + 1; + for (sampleIndex in 10:columns) { + gt = unlist(lapply(strsplit(d[,sampleIndex], ":"), function(x) x[1])); + d[,offset] = gt; + colnames(d)[offset] = sprintf("%s.GT", 
colnames(d)[sampleIndex]); + + offset = offset + 1; + } + } + + return(d); +} diff --git a/public/R/src/gsalib/R/gsa.warn.R b/public/R/src/gsalib/R/gsa.warn.R new file mode 100644 index 000000000..7ee08ce65 --- /dev/null +++ b/public/R/src/gsalib/R/gsa.warn.R @@ -0,0 +1,3 @@ +gsa.warn <- function(message) { + gsa.message(sprintf("Warning: %s", message)); +} diff --git a/public/R/src/gsalib/Read-and-delete-me b/public/R/src/gsalib/Read-and-delete-me new file mode 100644 index 000000000..d04323a6e --- /dev/null +++ b/public/R/src/gsalib/Read-and-delete-me @@ -0,0 +1,9 @@ +* Edit the help file skeletons in 'man', possibly combining help files + for multiple functions. +* Put any C/C++/Fortran code in 'src'. +* If you have compiled code, add a .First.lib() function in 'R' to load + the shared library. +* Run R CMD build to build the package tarball. +* Run R CMD check to check the package tarball. + +Read "Writing R Extensions" for more information. diff --git a/public/R/src/gsalib/data/tearsheetdrop.jpg b/public/R/src/gsalib/data/tearsheetdrop.jpg new file mode 100755 index 000000000..c9d480fa0 Binary files /dev/null and b/public/R/src/gsalib/data/tearsheetdrop.jpg differ diff --git a/public/R/src/gsalib/man/gsa.error.Rd b/public/R/src/gsalib/man/gsa.error.Rd new file mode 100644 index 000000000..df7c0cbde --- /dev/null +++ b/public/R/src/gsalib/man/gsa.error.Rd @@ -0,0 +1,49 @@ +\name{gsa.error} +\alias{gsa.error} +\title{ +GSA error +} +\description{ +Write an error message to standard out with the prefix '[gsalib] Error:', print a traceback, and exit. +} +\usage{ +gsa.error(message) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{message}{ +The error message to write. +} +} +\details{ +%% ~~ If necessary, more details than the description above ~~ +} +\value{ +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... 
+} +\references{ +%% ~put references to the literature/web site here ~ +} +\author{ +Kiran Garimella +} +\note{ +%% ~~further notes~~ +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} +\examples{ +gsa.error("This is a message"); +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{ ~kwd1 } +\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/public/R/src/gsalib/man/gsa.getargs.Rd b/public/R/src/gsalib/man/gsa.getargs.Rd new file mode 100644 index 000000000..27aa1b05a --- /dev/null +++ b/public/R/src/gsalib/man/gsa.getargs.Rd @@ -0,0 +1,57 @@ +\name{gsa.getargs} +\alias{gsa.getargs} +\title{ +Get script arguments +} +\description{ +Get script arguments given a list object specifying arguments and documentation. Can be used in command-line or interactive mode. This is helpful when developing scripts in interactive mode that will eventually become command-line programs. If no arguments are specified or help is requested in command-line mode, the script will print out a usage statement with available arguments and exit. +} +\usage{ +gsa.getargs(argspec, doc = NA) +} +\arguments{ + \item{argspec}{ +A list object. Each key is an argument name. The value is another list object with a 'value' and 'doc' keys. For example: +\preformatted{argspec = list( + arg1 = list(value=10, doc="Info for optional arg1"), + arg2 = list(value=NA, doc="Info for required arg2") +); +} + +If the value provided is NA, the argument is considered required and must be specified when the script is invoked. For command-line mode, this means the argument must be specified on the command-line. In interactive mode, there are two ways of specifying these arguments. First, if a properly formatted list argument called 'cmdargs' is present in the current environment (i.e. 
the object returned by gsa.getargs() from a previous invocation), the value is taken from this object. Otherwise, the argument is prompted for. +} + + \item{doc}{ +An optional string succinctly documenting the purpose of the script. +} +} +\details{ +Interactive scripts typically make use of hardcoded filepaths and parameter settings. This makes testing easy, but generalization to non-interactive mode more difficult. This utility provides a mechanism for writing scripts that work properly in both interactive and command-line modes. + +To use this method, specify a list with key-value pairs representing the arguments as specified above. In command-line mode, if no arguments are specified or the user specifies '-h' or '-help' anywhere on the command string, a help message listing the available arguments, their default values, and their documentation is printed. +} +\value{ +Returns a list with keys matching the argspec and values representing the specified arguments. + +\item{arg1 }{Value for argument 1} +\item{arg2 }{Value for argument 2} +...etc. +} +\references{ +%% ~put references to the literature/web site here ~ +} +\author{ +Kiran Garimella +} +\examples{ +argspec = list( + file = list(value="/my/test.vcf", doc="VCF file"), + verbose = list(value=0, doc="If 1, set verbose mode"), + test2 = list(value=2.3e9, doc="Another argument that does stuff") +); + +cmdargs = gsa.getargs(argspec, doc="My test program"); + +print(cmdargs$file); # will print '[1] "/my/test.vcf"' +} +\keyword{ ~kwd1 } diff --git a/public/R/src/gsalib/man/gsa.message.Rd b/public/R/src/gsalib/man/gsa.message.Rd new file mode 100644 index 000000000..9752de8a9 --- /dev/null +++ b/public/R/src/gsalib/man/gsa.message.Rd @@ -0,0 +1,44 @@ +\name{gsa.message} +\alias{gsa.message} +\title{ +GSA message +} +\description{ +Write a message to standard error with the prefix '[gsalib]'. +} +\usage{ +gsa.message(message) +} +\arguments{ + \item{message}{ +The message to write. 
+} +} +\details{ +%% ~~ If necessary, more details than the description above ~~ +} +\value{ +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +} +\references{ +%% ~put references to the literature/web site here ~ +} +\author{ +Kiran Garimella +} +\note{ +%% ~~further notes~~ +} + +\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} +\examples{ +## Write message to stdout +gsa.message("This is a message"); +} +\keyword{ ~kwd1 } diff --git a/public/R/src/gsalib/man/gsa.plot.venn.Rd b/public/R/src/gsalib/man/gsa.plot.venn.Rd new file mode 100644 index 000000000..bf4feb5bc --- /dev/null +++ b/public/R/src/gsalib/man/gsa.plot.venn.Rd @@ -0,0 +1,75 @@ +\name{gsa.plot.venn} +\alias{gsa.plot.venn} +\title{ +Plot a proportional venn diagram +} +\description{ +Plot a proportional venn diagram (two or three-way venns allowed) +} +\usage{ +gsa.plot.venn(a, b, c = 0, a_and_b, a_and_c = 0, b_and_c = 0, col = c("#FF6342", "#63C6DE", "#ADDE63"), pos = c(0.2, 0.2, 0.8, 0.82), debug = 0) +} +\arguments{ + \item{a}{ +size of 'a' circle +} + \item{b}{ +size of 'b' circle +} + \item{c}{ +size of 'c' circle +} + \item{a_and_b}{ +size of a and b overlap +} + \item{a_and_c}{ +size of a and c overlap +} + \item{b_and_c}{ +size of b and c overlap +} + \item{col}{ +vector of colors for each venn piece +} + \item{pos}{ +vector of positional elements +} + \item{debug}{ +if 1, set debug mode and print useful information +} +} +\details{ +Plots a two-way or three-way proportional Venn diagram. Internally, this method uses the Google Chart API to generate the diagram, then renders it into the plot window where it can be annotated in interesting ways. +} +\value{ +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... 
+} +\references{ +} +\author{ +Kiran Garimella +} +\note{ +%% ~~further notes~~ +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} +\examples{ +## Plot a two-way Venn diagram +gsa.plot.venn(1000, 750, 0, 400); + +## Plot a three-way Venn diagram +gsa.plot.venn(1000, 750, 900, 400, 650, 500); +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{ ~kwd1 } +\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/public/R/src/gsalib/man/gsa.read.eval.Rd b/public/R/src/gsalib/man/gsa.read.eval.Rd new file mode 100644 index 000000000..0e2baba73 --- /dev/null +++ b/public/R/src/gsalib/man/gsa.read.eval.Rd @@ -0,0 +1,111 @@ +\name{gsa.read.eval} +\alias{gsa.read.eval} +\title{ +Read a VariantEval file +} +\description{ +Read a VariantEval file that's output in R format. +} +\usage{ +gsa.read.eval(evalRoot) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{evalRoot}{ +%% ~~Describe \code{evalRoot} here~~ +} +} +\details{ +%% ~~ If necessary, more details than the description above ~~ +} +\value{ +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +} +\references{ +%% ~put references to the literature/web site here ~ +} +\author{ +%% ~~who you are~~ +} +\note{ +%% ~~further notes~~ +} + +%% ~Make other sections like Warning with \section{Warning }{....} ~ + +\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} +\examples{ +##---- Should be DIRECTLY executable !! ---- +##-- ==> Define data, use random, +##-- or do help(data=index) for the standard data sets. 
+ +## The function is currently defined as +function(evalRoot) { + fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep=""); + fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep=""); + fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep=""); + fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep=""); + fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep=""); + fileMetricsBySample = paste(evalRoot, ".MetricsBySample.csv", sep=""); + fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep=""); + fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep=""); + fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep=""); + fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep=""); + fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep=""); + fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep=""); + fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep=""); + fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep=""); + + eval = list( + AlleleCountStats = NA, + CompOverlap = NA, + CountVariants = NA, + GenotypeConcordance = NA, + MetricsByAc = NA, + MetricsBySample = NA, + Quality_Metrics_by_allele_count = NA, + QualityScoreHistogram = NA, + SampleStatistics = NA, + SampleSummaryStatistics = NA, + SimpleMetricsBySample = NA, + TiTv = NA, + TiTvStats = NA, + Variant_Quality_Score = NA, + + CallsetNames = c(), + CallsetOnlyNames = c(), + CallsetFilteredNames = c() + ); + + eval$AlleleCountStats = .attemptToLoadFile(fileAlleleCountStats); + eval$CompOverlap = .attemptToLoadFile(fileCompOverlap); + eval$CountVariants = .attemptToLoadFile(fileCountVariants); + eval$GenotypeConcordance = .attemptToLoadFile(fileGenotypeConcordance); + eval$MetricsByAc = .attemptToLoadFile(fileMetricsByAc); + eval$MetricsBySample = 
.attemptToLoadFile(fileMetricsBySample); + eval$Quality_Metrics_by_allele_count = .attemptToLoadFile(fileQuality_Metrics_by_allele_count); + eval$QualityScoreHistogram = .attemptToLoadFile(fileQualityScoreHistogram); + eval$SampleStatistics = .attemptToLoadFile(fileSampleStatistics); + eval$SampleSummaryStatistics = .attemptToLoadFile(fileSampleSummaryStatistics); + eval$SimpleMetricsBySample = .attemptToLoadFile(fileSimpleMetricsBySample); + eval$TiTv = .attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator); + eval$TiTvStats = .attemptToLoadFile(fileTiTvStats); + eval$Variant_Quality_Score = .attemptToLoadFile(fileVariant_Quality_Score); + + uniqueJexlExpressions = unique(eval$TiTv$jexl_expression); + eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]); + eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames)); + eval$CallsetFilteredNames = as.vector(c()); + eval; + } +} +% Add one or more standard keywords, see file 'KEYWORDS' in the +% R documentation directory. +\keyword{ ~kwd1 } +\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/public/R/src/gsalib/man/gsa.read.gatkreport.Rd b/public/R/src/gsalib/man/gsa.read.gatkreport.Rd new file mode 100644 index 000000000..67c2c7b28 --- /dev/null +++ b/public/R/src/gsalib/man/gsa.read.gatkreport.Rd @@ -0,0 +1,55 @@ +\name{gsa.read.gatkreport} +\alias{gsa.read.gatkreport} +\title{ +gsa.read.gatkreport +} +\description{ +Reads a GATKReport file - a multi-table document - and loads each table as a separate data.frame object in a list. +} +\usage{ +gsa.read.gatkreport(filename) +} +\arguments{ + \item{filename}{ +The path to the GATKReport file. +} +} +\details{ +The GATKReport format replaces the multi-file output format used by many GATK tools and provides a single, consolidated file format. This format accommodates multiple tables and is still R-loadable - through this function. 
+ +The file format looks like this: +\preformatted{##:GATKReport.v0.1 TableName : The description of the table +col1 col2 col3 +0 0.007451835696110506 25.474613284804366 +1 0.002362777171937477 29.844949954504095 +2 9.087604507451836E-4 32.87590975254731 +3 5.452562704471102E-4 34.498999090081895 +4 9.087604507451836E-4 35.14831665150137 +} + +} +\value{ +Returns a list object, where each key is the TableName and the value is the data.frame object with the contents of the table. If multiple tables with the same name exist, each one after the first will be given names of "TableName.1", "TableName.2", ..., "TableName.N". +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +} +\references{ +%% ~put references to the literature/web site here ~ +} +\author{ +Kiran Garimella +} +\note{ +%% ~~further notes~~ +} + +\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} +\examples{ +report = gsa.read.gatkreport("/path/to/my/output.gatkreport"); +} +\keyword{ ~kwd1 } diff --git a/public/R/src/gsalib/man/gsa.read.squidmetrics.Rd b/public/R/src/gsalib/man/gsa.read.squidmetrics.Rd new file mode 100644 index 000000000..0a8b37843 --- /dev/null +++ b/public/R/src/gsalib/man/gsa.read.squidmetrics.Rd @@ -0,0 +1,48 @@ +\name{gsa.read.squidmetrics} +\alias{gsa.read.squidmetrics} +\title{ +gsa.read.squidmetrics +} +\description{ +Reads metrics for a specified SQUID project into a dataframe. +} +\usage{ +gsa.read.squidmetrics(project, bylane = FALSE) +} +\arguments{ + \item{project}{ +The project for which metrics should be obtained. +} + \item{bylane}{ +If TRUE, obtains per-lane metrics rather than the default per-sample metrics. +} +} +\details{ +%% ~~ If necessary, more details than the description above ~~ +} +\value{ +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... 
+Returns a data frame with samples (or lanes) as the row and the metric as the column. +} +\references{ +%% ~put references to the literature/web site here ~ +} +\author{ +Kiran Garimella +} +\note{ +This method will only work within the Broad Institute internal network. +} + +\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} +\examples{ +## Obtain metrics for project C315. +d = gsa.read.squidmetrics("C315"); +} +\keyword{ ~kwd1 } diff --git a/public/R/src/gsalib/man/gsa.read.vcf.Rd b/public/R/src/gsalib/man/gsa.read.vcf.Rd new file mode 100644 index 000000000..cffd35e8f --- /dev/null +++ b/public/R/src/gsalib/man/gsa.read.vcf.Rd @@ -0,0 +1,53 @@ +\name{gsa.read.vcf} +\alias{gsa.read.vcf} +\title{ +gsa.read.vcf +} +\description{ +Reads a VCF file into a table. Optionally expands genotype columns into separate columns containing the genotype, separate from the other fields specified in the FORMAT field. +} +\usage{ +gsa.read.vcf(vcffile, skip=0, nrows=-1, expandGenotypeFields = FALSE) +} +\arguments{ + \item{vcffile}{ +The path to the vcf file. +} + \item{skip}{ +The number of lines of the data file to skip before beginning to read data. +} + \item{nrows}{ +The maximum number of rows to read in. Negative and other invalid values are ignored. +} + \item{expandGenotypeFields}{ +If TRUE, adds an additional column per sample containing just the genotype. +} +} +\details{ +The VCF format is the standard variant call file format used in the GATK. This function reads that data in as a table for easy analysis. +} +\value{ +Returns a data.frame object, where each column corresponds to the columns in the VCF file. +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... 
+} +\references{ +%% ~put references to the literature/web site here ~ +} +\author{ +Kiran Garimella +} +\note{ +%% ~~further notes~~ +} + +\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} +\examples{ +vcf = gsa.read.vcf("/path/to/my/output.vcf"); +} +\keyword{ ~kwd1 } diff --git a/public/R/src/gsalib/man/gsa.warn.Rd b/public/R/src/gsalib/man/gsa.warn.Rd new file mode 100644 index 000000000..0b9770b5c --- /dev/null +++ b/public/R/src/gsalib/man/gsa.warn.Rd @@ -0,0 +1,46 @@ +\name{gsa.warn} +\alias{gsa.warn} +\title{ +GSA warn +} +\description{ +Write a warning message to standard out with the prefix '[gsalib] Warning:'. +} +\usage{ +gsa.warn(message) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{message}{ +The warning message to write. +} +} +\details{ +%% ~~ If necessary, more details than the description above ~~ +} +\value{ +%% ~Describe the value returned +%% If it is a LIST, use +%% \item{comp1 }{Description of 'comp1'} +%% \item{comp2 }{Description of 'comp2'} +%% ... +} +\references{ +%% ~put references to the literature/web site here ~ +} +\author{ +Kiran Garimella +} +\note{ +%% ~~further notes~~ +} + +\seealso{ +%% ~~objects to See Also as \code{\link{help}}, ~~~ +} +\examples{ +## Write message to stdout +gsa.warn("This is a warning message"); +} +\keyword{ ~kwd1 } +\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line diff --git a/public/R/src/gsalib/man/gsalib-package.Rd b/public/R/src/gsalib/man/gsalib-package.Rd new file mode 100644 index 000000000..2b8d6db9f --- /dev/null +++ b/public/R/src/gsalib/man/gsalib-package.Rd @@ -0,0 +1,68 @@ +\name{gsalib-package} +\alias{gsalib-package} +\alias{gsalib} +\docType{package} +\title{ +GATK utility analysis functions +} +\description{ +Utility functions for analyzing GATK-processed NGS data +} +\details{ +This package contains functions for working with GATK-processed NGS data. 
These functions include a command-line parser that also allows a script to be used in interactive mode (good for developing scripts that will eventually be automated), a proportional Venn diagram generator, convenience methods for parsing VariantEval output, and more. +} +\author{ +Genome Sequencing and Analysis Group + +Medical and Population Genetics Program + +Maintainer: Kiran Garimella +} +\references{ +GSA wiki page: http://www.broadinstitute.org/gsa/wiki + +GATK help forum: http://www.getsatisfaction.com/gsa +} +\examples{ +## get script arguments in interactive and non-interactive mode +cmdargs = gsa.getargs( list( + requiredArg1 = list( + value = NA, + doc = "Documentation for requiredArg1" + ), + + optionalArg1 = list( + value = 3e9, + doc = "Documentation for optionalArg1" + ) +) ); + +## plot a proportional Venn diagram +gsa.plot.venn(500, 250, 0, 100); + +## read a GATKReport file +report = gsa.read.gatkreport("/path/to/my/output.gatkreport"); + +## emit a message +gsa.message("This is a message"); + +## emit a warning message +gsa.warn("This is a warning message"); + +## emit an error message +gsa.error("This is an error message"); + +## read the SQUID metrics for a given sequencing project (internal to the Broad only) +s = gsa.read.squidmetrics("C427"); + +## read command-line arguments +cmdargs = gsa.getargs( + list( + file = list(value="/my/test.vcf", doc="VCF file"), + verbose = list(value=0, doc="If 1, set verbose mode"), + test2 = list(value=2.3e9, doc="Another argument that does stuff") + ), + doc="My test program" +); +} +\keyword{ package } diff --git a/public/c/SeparateQltout.cc b/public/c/SeparateQltout.cc new file mode 100644 index 000000000..7644c9603 --- /dev/null +++ b/public/c/SeparateQltout.cc @@ -0,0 +1,70 @@ +#include "MainTools.h" +#include "Basevector.h" +#include "lookup/LookAlign.h" +#include "lookup/SerialQltout.h" + +unsigned int MatchingEnd(look_align &la, vecbasevector &candidates, vecbasevector &ref) { + 
//la.PrintParseable(cout); + + for (int i = 0; i < candidates.size(); i++) { + look_align newla = la; + + if (newla.rc1) { candidates[i].ReverseComplement(); } + newla.ResetFromAlign(newla.a, candidates[i], ref[la.target_id]); + + //newla.PrintParseable(cout, &candidates[i], &ref[newla.target_id]); + //cout << newla.Errors() << " " << la.Errors() << endl; + + if (newla.Errors() == la.Errors()) { + return i; + } + } + + //FatalErr("Query id " + ToString(la.query_id) + " had no matches."); + + return candidates.size() + 1; +} + +int main(int argc, char **argv) { + RunTime(); + + BeginCommandArguments; + CommandArgument_String(ALIGNS); + CommandArgument_String(FASTB_END_1); + CommandArgument_String(FASTB_END_2); + CommandArgument_String(REFERENCE); + + CommandArgument_String(ALIGNS_END_1_OUT); + CommandArgument_String(ALIGNS_END_2_OUT); + EndCommandArguments; + + vecbasevector ref(REFERENCE); + vecbasevector reads1(FASTB_END_1); + vecbasevector reads2(FASTB_END_2); + + ofstream aligns1stream(ALIGNS_END_1_OUT.c_str()); + ofstream aligns2stream(ALIGNS_END_2_OUT.c_str()); + + basevector bv; + + SerialQltout sqltout(ALIGNS); + look_align la; + while (sqltout.Next(la)) { + vecbasevector candidates(2); + candidates[0] = reads1[la.query_id]; + candidates[1] = reads2[la.query_id]; + + unsigned int matchingend = MatchingEnd(la, candidates, ref); + if (matchingend < 2) { + bv = (matchingend == 0) ? reads1[la.query_id] : reads2[la.query_id]; + + //la.PrintParseable(cout, &bv, &ref[la.target_id]); + la.PrintParseable(((matchingend == 0) ? 
aligns1stream : aligns2stream), &bv, &ref[la.target_id]); + } + } + + aligns1stream.close(); + aligns2stream.close(); + + return 0; +} diff --git a/public/c/bwa/Makefile b/public/c/bwa/Makefile new file mode 100644 index 000000000..6399a0e6d --- /dev/null +++ b/public/c/bwa/Makefile @@ -0,0 +1,21 @@ +CXX=g++ +CXXFLAGS=-g -Wall -O2 -m64 -fPIC + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -I$(BWA_HOME) -I$(JAVA_INCLUDE) $< -o $@ + +all: init lib + +init: + @echo Please make sure the following platforms are set correctly on your machine. + @echo BWA_HOME=$(BWA_HOME) + @echo JAVA_INCLUDE=$(JAVA_INCLUDE) + @echo TARGET_LIB=$(TARGET_LIB) + @echo EXTRA_LIBS=$(EXTRA_LIBS) + @echo LIBTOOL_COMMAND=$(LIBTOOL_COMMAND) + +lib: org_broadinstitute_sting_alignment_bwa_c_BWACAligner.o bwa_gateway.o + $(LIBTOOL_COMMAND) $? -o $(TARGET_LIB) -L$(BWA_HOME) -lbwacore $(EXTRA_LIBS) + +clean: + rm *.o libbwa.* diff --git a/public/c/bwa/build_linux.sh b/public/c/bwa/build_linux.sh new file mode 100755 index 000000000..b3631a28d --- /dev/null +++ b/public/c/bwa/build_linux.sh @@ -0,0 +1,7 @@ +#!/bin/sh +export BWA_HOME="/humgen/gsa-scr1/hanna/src/bwa-trunk/bwa" +export JAVA_INCLUDE="/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include -I/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include/linux" +export TARGET_LIB="libbwa.so" +export EXTRA_LIBS="-lc -lz -lstdc++ -lpthread" +export LIBTOOL_COMMAND="g++ -shared -Wl,-soname,libbwa.so" +make diff --git a/public/c/bwa/build_mac.sh b/public/c/bwa/build_mac.sh new file mode 100644 index 000000000..bfed900bb --- /dev/null +++ b/public/c/bwa/build_mac.sh @@ -0,0 +1,7 @@ +#!/bin/sh +export BWA_HOME="/Users/mhanna/src/bwa" +export JAVA_INCLUDE="/System/Library/Frameworks/JavaVM.framework/Headers" +export TARGET_LIB="libbwa.dylib" +export EXTRA_LIBS="-lc -lz -lsupc++" +export LIBTOOL_COMMAND="libtool -dynamic" +make diff --git a/public/c/bwa/bwa_gateway.cpp b/public/c/bwa/bwa_gateway.cpp new file mode 100644 index 000000000..00f5aa5bc --- /dev/null +++ 
b/public/c/bwa/bwa_gateway.cpp @@ -0,0 +1,277 @@ +#include +#include +#include + +#include "bwase.h" +#include "bwa_gateway.h" + +BWA::BWA(const char* ann_filename, + const char* amb_filename, + const char* pac_filename, + const char* forward_bwt_filename, + const char* forward_sa_filename, + const char* reverse_bwt_filename, + const char* reverse_sa_filename) +{ + // Load the bns (?) and reference + bns = bns_restore_core(ann_filename,amb_filename,pac_filename); + reference = new ubyte_t[bns->l_pac/4+1]; + rewind(bns->fp_pac); + fread(reference, 1, bns->l_pac/4+1, bns->fp_pac); + fclose(bns->fp_pac); + bns->fp_pac = NULL; + + // Load the BWTs (both directions) and suffix arrays (both directions) + bwts[0] = bwt_restore_bwt(forward_bwt_filename); + bwt_restore_sa(forward_sa_filename, bwts[0]); + bwts[1] = bwt_restore_bwt(reverse_bwt_filename); + bwt_restore_sa(reverse_sa_filename, bwts[1]); + load_default_options(); + + // Always reinitialize the random seed whenever a new set of files are loaded. + initialize_random_seed(); + + // initialize the bwase subsystem + bwase_initialize(); +} + +BWA::~BWA() { + delete[] reference; + bns_destroy(bns); + bwt_destroy(bwts[0]); + bwt_destroy(bwts[1]); +} + +void BWA::find_paths(const char* bases, const unsigned read_length, bwt_aln1_t*& paths, unsigned& num_paths, unsigned& best_path_count, unsigned& second_best_path_count) +{ + bwa_seq_t* sequence = create_sequence(bases, read_length); + + // Calculate the suffix array interval for each sequence, storing the result in sequence->aln (and sequence->n_aln). + // This method will destroy the contents of seq and rseq. + bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options); + + paths = new bwt_aln1_t[sequence->n_aln]; + memcpy(paths,sequence->aln,sequence->n_aln*sizeof(bwt_aln1_t)); + num_paths = sequence->n_aln; + + // Call aln2seq to initialize the type of match present. 
+ bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); + best_path_count = sequence->c1; + second_best_path_count = sequence->c2; + + bwa_free_read_seq(1,sequence); +} + +Alignment* BWA::generate_single_alignment(const char* bases, const unsigned read_length) { + bwa_seq_t* sequence = create_sequence(bases,read_length); + + // Calculate paths. + bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options); + + // Check for no alignments found and return null. + if(sequence->n_aln == 0) { + bwa_free_read_seq(1,sequence); + return NULL; + } + + // bwa_cal_sa_reg_gap destroys the bases / read length. Copy them back in. + copy_bases_into_sequence(sequence,bases,read_length); + + // Pick best alignment and propagate its information into the sequence. + bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); + + // Generate the best alignment from the sequence. + Alignment* alignment = new Alignment; + *alignment = generate_final_alignment_from_sequence(sequence); + + bwa_free_read_seq(1,sequence); + + return alignment; +} + +void BWA::generate_alignments_from_paths(const char* bases, + const unsigned read_length, + bwt_aln1_t* paths, + const unsigned num_paths, + const unsigned best_count, + const unsigned second_best_count, + Alignment*& alignments, + unsigned& num_alignments) +{ + bwa_seq_t* sequence = create_sequence(bases,read_length); + + sequence->aln = paths; + sequence->n_aln = num_paths; + + // (Ab)use bwa_aln2seq to propagate values stored in the path out into the sequence itself. + bwa_aln2seq(sequence->n_aln,sequence->aln,sequence); + + // But overwrite key parts of the sequence in case the user passed back only a smaller subset + // of the paths. + sequence->c1 = best_count; + sequence->c2 = second_best_count; + sequence->type = sequence->c1 > 1 ? 
BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE; + + num_alignments = 0; + for(unsigned i = 0; i < (unsigned)sequence->n_aln; i++) + num_alignments += (sequence->aln + i)->l - (sequence->aln + i)->k + 1; + + alignments = new Alignment[num_alignments]; + unsigned alignment_idx = 0; + + for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) { + // Stub in a 'working' path, so that only the desired alignment is local-aligned. + const bwt_aln1_t* path = paths + path_idx; + bwt_aln1_t working_path = *path; + + // Loop through all alignments, aligning each one individually. + for(unsigned sa_idx = path->k; sa_idx <= path->l; sa_idx++) { + working_path.k = working_path.l = sa_idx; + sequence->aln = &working_path; + sequence->n_aln = 1; + + sequence->sa = sa_idx; + sequence->strand = path->a; + sequence->score = path->score; + + // Each time through bwa_refine_gapped, seq gets reversed. Revert the reverse. + // TODO: Fix the interface to bwa_refine_gapped so its easier to work with. + if(alignment_idx > 0) + seq_reverse(sequence->len, sequence->seq, 0); + + // Copy the local alignment data into the alignment object. + *(alignments + alignment_idx) = generate_final_alignment_from_sequence(sequence); + + alignment_idx++; + } + } + + sequence->aln = NULL; + sequence->n_aln = 0; + + bwa_free_read_seq(1,sequence); +} + +Alignment BWA::generate_final_alignment_from_sequence(bwa_seq_t* sequence) { + // Calculate the local coordinate and local alignment. + bwa_cal_pac_pos_core(bwts[0],bwts[1],sequence,options.max_diff,options.fnr); + bwa_refine_gapped(bns, 1, sequence, reference, NULL); + + // Copy the local alignment data into the alignment object. 
+ Alignment alignment; + + // Populate basic path info + alignment.edit_distance = sequence->nm; + alignment.num_mismatches = sequence->n_mm; + alignment.num_gap_opens = sequence->n_gapo; + alignment.num_gap_extensions = sequence->n_gape; + alignment.num_best = sequence->c1; + alignment.num_second_best = sequence->c2; + + // Final alignment position. + alignment.type = sequence->type; + bns_coor_pac2real(bns, sequence->pos, pos_end(sequence) - sequence->pos, &alignment.contig); + alignment.pos = sequence->pos - bns->anns[alignment.contig].offset + 1; + alignment.negative_strand = sequence->strand; + alignment.mapping_quality = sequence->mapQ; + + // Cigar step. + alignment.cigar = NULL; + if(sequence->cigar) { + alignment.cigar = new uint16_t[sequence->n_cigar]; + memcpy(alignment.cigar,sequence->cigar,sequence->n_cigar*sizeof(uint16_t)); + } + alignment.n_cigar = sequence->n_cigar; + + // MD tag with a better breakdown of differences in the cigar + alignment.md = strdup(sequence->md); + delete[] sequence->md; + sequence->md = NULL; + + return alignment; +} + +void BWA::load_default_options() +{ + options.s_mm = 3; + options.s_gapo = 11; + options.s_gape = 4; + options.mode = 3; + options.indel_end_skip = 5; + options.max_del_occ = 10; + options.max_entries = 2000000; + options.fnr = 0.04; + options.max_diff = -1; + options.max_gapo = 1; + options.max_gape = 6; + options.max_seed_diff = 2; + options.seed_len = 2147483647; + options.n_threads = 1; + options.max_top2 = 30; + options.trim_qual = 0; +} + +void BWA::initialize_random_seed() +{ + srand48(bns->seed); +} + +void BWA::set_max_edit_distance(float edit_distance) { + if(edit_distance > 0 && edit_distance < 1) { + options.fnr = edit_distance; + options.max_diff = -1; + } + else { + options.fnr = -1.0; + options.max_diff = (int)edit_distance; + } +} + +void BWA::set_max_gap_opens(int max_gap_opens) { options.max_gapo = max_gap_opens; } +void BWA::set_max_gap_extensions(int max_gap_extensions) { options.max_gape 
= max_gap_extensions; } +void BWA::set_disallow_indel_within_range(int indel_range) { options.indel_end_skip = indel_range; } +void BWA::set_mismatch_penalty(int penalty) { options.s_mm = penalty; } +void BWA::set_gap_open_penalty(int penalty) { options.s_gapo = penalty; } +void BWA::set_gap_extension_penalty(int penalty) { options.s_gape = penalty; } + +/** + * Create a sequence with a set of reasonable initial defaults. + * Will leave seq and rseq empty. + */ +bwa_seq_t* BWA::create_sequence(const char* bases, const unsigned read_length) +{ + bwa_seq_t* sequence = new bwa_seq_t; + + sequence->tid = -1; + + sequence->name = 0; + + copy_bases_into_sequence(sequence, bases, read_length); + + sequence->qual = 0; + sequence->aln = 0; + sequence->md = 0; + + sequence->cigar = NULL; + sequence->n_cigar = 0; + + sequence->multi = NULL; + sequence->n_multi = 0; + + return sequence; +} + +void BWA::copy_bases_into_sequence(bwa_seq_t* sequence, const char* bases, const unsigned read_length) +{ + // seq, rseq will ultimately be freed by bwa_cal_sa_reg_gap + sequence->seq = new ubyte_t[read_length]; + sequence->rseq = new ubyte_t[read_length]; + for(unsigned i = 0; i < read_length; i++) sequence->seq[i] = nst_nt4_table[(unsigned)bases[i]]; + memcpy(sequence->rseq,sequence->seq,read_length); + + // BWA expects the read bases to arrive reversed. 
+ seq_reverse(read_length,sequence->seq,0); + seq_reverse(read_length,sequence->rseq,1); + + sequence->full_len = sequence->len = read_length; +} diff --git a/public/c/bwa/bwa_gateway.h b/public/c/bwa/bwa_gateway.h new file mode 100644 index 000000000..2d26ec650 --- /dev/null +++ b/public/c/bwa/bwa_gateway.h @@ -0,0 +1,83 @@ +#ifndef BWA_GATEWAY +#define BWA_GATEWAY + +#include + +#include "bntseq.h" +#include "bwt.h" +#include "bwtaln.h" + +class Alignment { + public: + uint32_t type; + int contig; + bwtint_t pos; + bool negative_strand; + uint32_t mapping_quality; + + uint16_t *cigar; + int n_cigar; + + uint8_t num_mismatches; + uint8_t num_gap_opens; + uint8_t num_gap_extensions; + uint16_t edit_distance; + + uint32_t num_best; + uint32_t num_second_best; + + char* md; +}; + +class BWA { + private: + bntseq_t *bns; + ubyte_t* reference; + bwt_t* bwts[2]; + gap_opt_t options; + + void load_default_options(); + void initialize_random_seed(); + bwa_seq_t* create_sequence(const char* bases, const unsigned read_length); + void copy_bases_into_sequence(bwa_seq_t* sequence, const char* bases, const unsigned read_length); + Alignment generate_final_alignment_from_sequence(bwa_seq_t* sequence); + + public: + BWA(const char* ann_filename, + const char* amb_filename, + const char* pac_filename, + const char* forward_bwt_filename, + const char* forward_sa_filename, + const char* reverse_bwt_filename, + const char* reverse_sa_filename); + ~BWA(); + + // Parameterize the aligner. 
+ void set_max_edit_distance(float edit_distance); + void set_max_gap_opens(int max_gap_opens); + void set_max_gap_extensions(int max_gap_extensions); + void set_disallow_indel_within_range(int indel_range); + void set_mismatch_penalty(int penalty); + void set_gap_open_penalty(int penalty); + void set_gap_extension_penalty(int penalty); + + // Perform the alignment + Alignment* generate_single_alignment(const char* bases, + const unsigned read_length); + void find_paths(const char* bases, + const unsigned read_length, + bwt_aln1_t*& paths, + unsigned& num_paths, + unsigned& best_path_count, + unsigned& second_best_path_count); + void generate_alignments_from_paths(const char* bases, + const unsigned read_length, + bwt_aln1_t* paths, + const unsigned num_paths, + const unsigned best_count, + const unsigned second_best_count, + Alignment*& alignments, + unsigned& num_alignments); +}; + +#endif // BWA_GATEWAY diff --git a/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp b/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp new file mode 100644 index 000000000..1ccbef0d4 --- /dev/null +++ b/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp @@ -0,0 +1,437 @@ +#include +#include +#include + +#include "bntseq.h" +#include "bwt.h" +#include "bwtaln.h" +#include "bwa_gateway.h" +#include "org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h" + +typedef void (BWA::*int_setter)(int value); +typedef void (BWA::*float_setter)(float value); + +static jobject convert_to_java_alignment(JNIEnv* env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment); +static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name); +static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter); +static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, 
float_setter setter); +static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message); + +JNIEXPORT jlong JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_create(JNIEnv* env, jobject instance, jobject bwtFiles, jobject configuration) +{ + jstring java_ann = get_configuration_file(env,bwtFiles,"annFile"); + if(java_ann == NULL) return 0L; + jstring java_amb = get_configuration_file(env,bwtFiles,"ambFile"); + if(java_amb == NULL) return 0L; + jstring java_pac = get_configuration_file(env,bwtFiles,"pacFile"); + if(java_pac == NULL) return 0L; + jstring java_forward_bwt = get_configuration_file(env,bwtFiles,"forwardBWTFile"); + if(java_forward_bwt == NULL) return 0L; + jstring java_forward_sa = get_configuration_file(env,bwtFiles,"forwardSAFile"); + if(java_forward_sa == NULL) return 0L; + jstring java_reverse_bwt = get_configuration_file(env,bwtFiles,"reverseBWTFile"); + if(java_reverse_bwt == NULL) return 0L; + jstring java_reverse_sa = get_configuration_file(env,bwtFiles,"reverseSAFile"); + if(java_reverse_sa == NULL) return 0L; + + const char* ann_filename = env->GetStringUTFChars(java_ann,JNI_FALSE); + if(env->ExceptionCheck()) return 0L; + const char* amb_filename = env->GetStringUTFChars(java_amb,JNI_FALSE); + if(env->ExceptionCheck()) return 0L; + const char* pac_filename = env->GetStringUTFChars(java_pac,JNI_FALSE); + if(env->ExceptionCheck()) return 0L; + const char* forward_bwt_filename = env->GetStringUTFChars(java_forward_bwt,JNI_FALSE); + if(env->ExceptionCheck()) return 0L; + const char* forward_sa_filename = env->GetStringUTFChars(java_forward_sa,JNI_FALSE); + if(env->ExceptionCheck()) return 0L; + const char* reverse_bwt_filename = env->GetStringUTFChars(java_reverse_bwt,JNI_FALSE); + if(env->ExceptionCheck()) return 0L; + const char* reverse_sa_filename = env->GetStringUTFChars(java_reverse_sa,JNI_FALSE); + if(env->ExceptionCheck()) return 0L; + + BWA* bwa = new BWA(ann_filename, + amb_filename, + 
pac_filename, + forward_bwt_filename, + forward_sa_filename, + reverse_bwt_filename, + reverse_sa_filename); + + Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration(env,instance,(jlong)bwa,configuration); + if(env->ExceptionCheck()) return 0L; + + env->ReleaseStringUTFChars(java_ann,ann_filename); + if(env->ExceptionCheck()) return 0L; + env->ReleaseStringUTFChars(java_amb,amb_filename); + if(env->ExceptionCheck()) return 0L; + env->ReleaseStringUTFChars(java_pac,pac_filename); + if(env->ExceptionCheck()) return 0L; + env->ReleaseStringUTFChars(java_forward_bwt,forward_bwt_filename); + if(env->ExceptionCheck()) return 0L; + env->ReleaseStringUTFChars(java_forward_sa,forward_sa_filename); + if(env->ExceptionCheck()) return 0L; + env->ReleaseStringUTFChars(java_reverse_bwt,reverse_bwt_filename); + if(env->ExceptionCheck()) return 0L; + env->ReleaseStringUTFChars(java_reverse_sa,reverse_sa_filename); + if(env->ExceptionCheck()) return 0L; + + return (jlong)bwa; +} + +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_destroy(JNIEnv* env, jobject instance, jlong java_bwa) +{ + BWA* bwa = (BWA*)java_bwa; + delete bwa; +} + +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration(JNIEnv *env, jobject instance, jlong java_bwa, jobject configuration) { + BWA* bwa = (BWA*)java_bwa; + set_float_configuration_param(env, configuration, "maximumEditDistance", bwa, &BWA::set_max_edit_distance); + if(env->ExceptionCheck()) return; + set_int_configuration_param(env, configuration, "maximumGapOpens", bwa, &BWA::set_max_gap_opens); + if(env->ExceptionCheck()) return; + set_int_configuration_param(env, configuration, "maximumGapExtensions", bwa, &BWA::set_max_gap_extensions); + if(env->ExceptionCheck()) return; + set_int_configuration_param(env, configuration, "disallowIndelWithinRange", bwa, &BWA::set_disallow_indel_within_range); + if(env->ExceptionCheck()) return; + 
set_int_configuration_param(env, configuration, "mismatchPenalty", bwa, &BWA::set_mismatch_penalty); + if(env->ExceptionCheck()) return; + set_int_configuration_param(env, configuration, "gapOpenPenalty", bwa, &BWA::set_gap_open_penalty); + if(env->ExceptionCheck()) return; + set_int_configuration_param(env, configuration, "gapExtensionPenalty", bwa, &BWA::set_gap_extension_penalty); + if(env->ExceptionCheck()) return; +} + +JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases) +{ + BWA* bwa = (BWA*)java_bwa; + + const jsize read_length = env->GetArrayLength(java_bases); + if(env->ExceptionCheck()) return NULL; + + jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE); + if(read_bases == NULL) return NULL; + + bwt_aln1_t* paths = NULL; + unsigned num_paths = 0; + + unsigned best_path_count, second_best_path_count; + bwa->find_paths((const char*)read_bases,read_length,paths,num_paths,best_path_count,second_best_path_count); + + jobjectArray java_paths = env->NewObjectArray(num_paths, env->FindClass("org/broadinstitute/sting/alignment/bwa/c/BWAPath"), NULL); + if(java_paths == NULL) return NULL; + + for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) { + bwt_aln1_t& path = *(paths + path_idx); + + jclass java_path_class = env->FindClass("org/broadinstitute/sting/alignment/bwa/c/BWAPath"); + if(java_path_class == NULL) return NULL; + + jmethodID java_path_constructor = env->GetMethodID(java_path_class, "", "(IIIZJJIII)V"); + if(java_path_constructor == NULL) return NULL; + + // Note that k/l are being cast to long. Bad things will happen if JNI assumes that they're ints. 
+ jobject java_path = env->NewObject(java_path_class, + java_path_constructor, + path.n_mm, + path.n_gapo, + path.n_gape, + path.a, + (jlong)path.k, + (jlong)path.l, + path.score, + best_path_count, + second_best_path_count); + if(java_path == NULL) return NULL; + + env->SetObjectArrayElement(java_paths,path_idx,java_path); + if(env->ExceptionCheck()) return NULL; + + env->DeleteLocalRef(java_path_class); + if(env->ExceptionCheck()) return NULL; + } + + delete[] paths; + + env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE); + + return env->ExceptionCheck() ? NULL : java_paths; +} + +JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_convertPathsToAlignments(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases, jobjectArray java_paths) +{ + BWA* bwa = (BWA*)java_bwa; + + const jsize read_length = env->GetArrayLength(java_bases); + if(env->ExceptionCheck()) return NULL; + + jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE); + if(read_bases == NULL) return NULL; + + const jsize num_paths = env->GetArrayLength(java_paths); + bwt_aln1_t* paths = new bwt_aln1_t[num_paths]; + unsigned best_count = 0, second_best_count = 0; + + for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) { + jobject java_path = env->GetObjectArrayElement(java_paths,path_idx); + jclass java_path_class = env->GetObjectClass(java_path); + if(java_path_class == NULL) return NULL; + + bwt_aln1_t& path = *(paths + path_idx); + + jfieldID mismatches_field = env->GetFieldID(java_path_class, "numMismatches", "I"); + if(mismatches_field == NULL) return NULL; + path.n_mm = env->GetIntField(java_path,mismatches_field); + if(env->ExceptionCheck()) return NULL; + + jfieldID gap_opens_field = env->GetFieldID(java_path_class, "numGapOpens", "I"); + if(gap_opens_field == NULL) return NULL; + path.n_gapo = env->GetIntField(java_path,gap_opens_field); + if(env->ExceptionCheck()) return NULL; + + jfieldID 
gap_extensions_field = env->GetFieldID(java_path_class, "numGapExtensions", "I"); + if(gap_extensions_field == NULL) return NULL; + path.n_gape = env->GetIntField(java_path,gap_extensions_field); + if(env->ExceptionCheck()) return NULL; + + jfieldID negative_strand_field = env->GetFieldID(java_path_class, "negativeStrand", "Z"); + if(negative_strand_field == NULL) return NULL; + path.a = env->GetBooleanField(java_path,negative_strand_field); + if(env->ExceptionCheck()) return NULL; + + jfieldID k_field = env->GetFieldID(java_path_class, "k", "J"); + if(k_field == NULL) return NULL; + path.k = env->GetLongField(java_path,k_field); + if(env->ExceptionCheck()) return NULL; + + jfieldID l_field = env->GetFieldID(java_path_class, "l", "J"); + if(l_field == NULL) return NULL; + path.l = env->GetLongField(java_path,l_field); + if(env->ExceptionCheck()) return NULL; + + jfieldID score_field = env->GetFieldID(java_path_class, "score", "I"); + if(score_field == NULL) return NULL; + path.score = env->GetIntField(java_path,score_field); + if(env->ExceptionCheck()) return NULL; + + jfieldID best_count_field = env->GetFieldID(java_path_class, "bestCount", "I"); + if(best_count_field == NULL) return NULL; + best_count = env->GetIntField(java_path,best_count_field); + if(env->ExceptionCheck()) return NULL; + + jfieldID second_best_count_field = env->GetFieldID(java_path_class, "secondBestCount", "I"); + if(second_best_count_field == NULL) return NULL; + second_best_count = env->GetIntField(java_path,second_best_count_field); + if(env->ExceptionCheck()) return NULL; + } + + Alignment* alignments = NULL; + unsigned num_alignments = 0; + bwa->generate_alignments_from_paths((const char*)read_bases,read_length,paths,num_paths,best_count,second_best_count,alignments,num_alignments); + + jobjectArray java_alignments = env->NewObjectArray(num_alignments, env->FindClass("org/broadinstitute/sting/alignment/Alignment"), NULL); + if(java_alignments == NULL) return NULL; + + for(unsigned 
alignment_idx = 0; alignment_idx < (unsigned)num_alignments; alignment_idx++) { + Alignment& alignment = *(alignments + alignment_idx); + jobject java_alignment = convert_to_java_alignment(env,read_bases,read_length,alignment); + if(java_alignment == NULL) return NULL; + env->SetObjectArrayElement(java_alignments,alignment_idx,java_alignment); + if(env->ExceptionCheck()) return NULL; + } + + delete[] alignments; + delete[] paths; + + env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE); + + return env->ExceptionCheck() ? NULL : java_alignments; +} + +JNIEXPORT jobject JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getBestAlignment(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases) { + BWA* bwa = (BWA*)java_bwa; + + const jsize read_length = env->GetArrayLength(java_bases); + if(env->ExceptionCheck()) return NULL; + + jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE); + if(read_bases == NULL) return NULL; + + Alignment* best_alignment = bwa->generate_single_alignment((const char*)read_bases,read_length); + jobject java_best_alignment = (best_alignment != NULL) ? 
convert_to_java_alignment(env,read_bases,read_length,*best_alignment) : NULL; + delete best_alignment; + + env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE); + + return java_best_alignment; +} + +static jobject convert_to_java_alignment(JNIEnv *env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment) { + unsigned cigar_length; + if(alignment.type == BWA_TYPE_NO_MATCH) cigar_length = 0; + else if(!alignment.cigar) cigar_length = 1; + else cigar_length = alignment.n_cigar; + + jcharArray java_cigar_operators = env->NewCharArray(cigar_length); + if(java_cigar_operators == NULL) return NULL; + jintArray java_cigar_lengths = env->NewIntArray(cigar_length); + if(java_cigar_lengths == NULL) return NULL; + + if(alignment.cigar) { + for(unsigned cigar_idx = 0; cigar_idx < (unsigned)alignment.n_cigar; ++cigar_idx) { + jchar cigar_operator = "MIDS"[alignment.cigar[cigar_idx]>>14]; + jint cigar_length = alignment.cigar[cigar_idx]&0x3fff; + + env->SetCharArrayRegion(java_cigar_operators,cigar_idx,1,&cigar_operator); + if(env->ExceptionCheck()) return NULL; + env->SetIntArrayRegion(java_cigar_lengths,cigar_idx,1,&cigar_length); + if(env->ExceptionCheck()) return NULL; + } + } + else { + if(alignment.type != BWA_TYPE_NO_MATCH) { + jchar cigar_operator = 'M'; + env->SetCharArrayRegion(java_cigar_operators,0,1,&cigar_operator); + if(env->ExceptionCheck()) return NULL; + env->SetIntArrayRegion(java_cigar_lengths,0,1,&read_length); + if(env->ExceptionCheck()) return NULL; + } + } + delete[] alignment.cigar; + + jclass java_alignment_class = env->FindClass("org/broadinstitute/sting/alignment/Alignment"); + if(java_alignment_class == NULL) return NULL; + + jmethodID java_alignment_constructor = env->GetMethodID(java_alignment_class, "", "(IIZI[C[IILjava/lang/String;IIIII)V"); + if(java_alignment_constructor == NULL) return NULL; + + jstring java_md = env->NewStringUTF(alignment.md); + if(java_md == NULL) return NULL; + delete[] alignment.md; + + 
jobject java_alignment = env->NewObject(java_alignment_class, + java_alignment_constructor, + alignment.contig, + alignment.pos, + alignment.negative_strand, + alignment.mapping_quality, + java_cigar_operators, + java_cigar_lengths, + alignment.edit_distance, + java_md, + alignment.num_mismatches, + alignment.num_gap_opens, + alignment.num_gap_extensions, + alignment.num_best, + alignment.num_second_best); + if(java_alignment == NULL) return NULL; + + env->DeleteLocalRef(java_alignment_class); + if(env->ExceptionCheck()) return NULL; + + return java_alignment; +} + +static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name) { + jclass configuration_class = env->GetObjectClass(configuration); + if(configuration_class == NULL) return NULL; + + jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/io/File;"); + if(configuration_field == NULL) return NULL; + + jobject configuration_file = (jobject)env->GetObjectField(configuration,configuration_field); + + jclass file_class = env->FindClass("java/io/File"); + if(file_class == NULL) return NULL; + + jmethodID path_extractor = env->GetMethodID(file_class,"getAbsolutePath", "()Ljava/lang/String;"); + if(path_extractor == NULL) return NULL; + + jstring path = (jstring)env->CallObjectMethod(configuration_file,path_extractor); + if(path == NULL) return NULL; + + env->DeleteLocalRef(configuration_class); + env->DeleteLocalRef(file_class); + env->DeleteLocalRef(configuration_file); + + return path; +} + +static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter) { + jclass configuration_class = env->GetObjectClass(configuration); + if(configuration_class == NULL) return; + + jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/Integer;"); + if(configuration_field == NULL) return; + + jobject boxed_value = 
env->GetObjectField(configuration,configuration_field); + if(env->ExceptionCheck()) return; + + if(boxed_value != NULL) { + jclass int_box_class = env->FindClass("java/lang/Integer"); + if(int_box_class == NULL) return; + + jmethodID int_extractor = env->GetMethodID(int_box_class,"intValue", "()I"); + if(int_extractor == NULL) return; + + jint value = env->CallIntMethod(boxed_value,int_extractor); + if(env->ExceptionCheck()) return; + + if(value < 0) + { + throw_config_value_exception(env,field_name,"cannot be set to a negative value"); + return; + } + + (bwa->*setter)(value); + + env->DeleteLocalRef(int_box_class); + } + + env->DeleteLocalRef(boxed_value); + env->DeleteLocalRef(configuration_class); +} + +static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, float_setter setter) +{ + jclass configuration_class = env->GetObjectClass(configuration); + if(configuration_class == NULL) return; + + jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/Float;"); + if(configuration_field == NULL) return; + + jobject boxed_value = env->GetObjectField(configuration,configuration_field); + if(boxed_value != NULL) { + jclass float_box_class = env->FindClass("java/lang/Float"); + if(float_box_class == NULL) return; + + jmethodID float_extractor = env->GetMethodID(float_box_class,"floatValue", "()F"); + if(float_extractor == NULL) return; + + jfloat value = env->CallFloatMethod(boxed_value,float_extractor); + if(env->ExceptionCheck()) return; + + if(value < 0) + { + throw_config_value_exception(env,field_name,"cannot be set to a negative value"); + return; + } + + (bwa->*setter)(value); + + env->DeleteLocalRef(float_box_class); + } + + env->DeleteLocalRef(boxed_value); + env->DeleteLocalRef(configuration_class); +} + +static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message) +{ + char* buffer = new char[strlen(field_name)+1+strlen(message)+1]; 
+ sprintf(buffer,"%s %s",field_name,message); + jclass sting_exception_class = env->FindClass("org/broadinstitute/sting/utils/StingException"); + if(sting_exception_class == NULL) return; + env->ThrowNew(sting_exception_class, buffer); + delete[] buffer; +} diff --git a/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h b/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h new file mode 100644 index 000000000..0c44e430a --- /dev/null +++ b/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h @@ -0,0 +1,61 @@ +/* DO NOT EDIT THIS FILE - it is machine generated */ +#include +/* Header for class org_broadinstitute_sting_alignment_bwa_c_BWACAligner */ + +#ifndef _Included_org_broadinstitute_sting_alignment_bwa_c_BWACAligner +#define _Included_org_broadinstitute_sting_alignment_bwa_c_BWACAligner +#ifdef __cplusplus +extern "C" { +#endif +/* + * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner + * Method: create + * Signature: (Lorg/broadinstitute/sting/alignment/bwa/BWTFiles;Lorg/broadinstitute/sting/alignment/bwa/BWAConfiguration;)J + */ +JNIEXPORT jlong JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_create + (JNIEnv *, jobject, jobject, jobject); + +/* + * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner + * Method: updateConfiguration + * Signature: (JLorg/broadinstitute/sting/alignment/bwa/BWAConfiguration;)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration + (JNIEnv *, jobject, jlong, jobject); + +/* + * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner + * Method: destroy + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_destroy + (JNIEnv *, jobject, jlong); + +/* + * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner + * Method: getPaths + * Signature: (J[B)[Lorg/broadinstitute/sting/alignment/bwa/c/BWAPath; + */ +JNIEXPORT jobjectArray JNICALL 
Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths + (JNIEnv *, jobject, jlong, jbyteArray); + +/* + * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner + * Method: convertPathsToAlignments + * Signature: (J[B[Lorg/broadinstitute/sting/alignment/bwa/c/BWAPath;)[Lorg/broadinstitute/sting/alignment/Alignment; + */ +JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_convertPathsToAlignments + (JNIEnv *, jobject, jlong, jbyteArray, jobjectArray); + +/* + * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner + * Method: getBestAlignment + * Signature: (J[B)Lorg/broadinstitute/sting/alignment/Alignment; + */ +JNIEXPORT jobject JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getBestAlignment + (JNIEnv *, jobject, jlong, jbyteArray); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/public/c/libenvironhack/Makefile b/public/c/libenvironhack/Makefile new file mode 100644 index 000000000..302ff8e31 --- /dev/null +++ b/public/c/libenvironhack/Makefile @@ -0,0 +1,10 @@ +CC=gcc +CCFLAGS=-Wall -dynamiclib -arch i386 -arch x86_64 + +libenvironhack.dylib: libenvironhack.c + $(CC) $(CCFLAGS) -init _init_environ $< -o $@ + +all: libenvironhack.dylib + +clean: + rm -f libenvironhack.dylib diff --git a/public/c/libenvironhack/libenvironhack.c b/public/c/libenvironhack/libenvironhack.c new file mode 100644 index 000000000..8b2a2640e --- /dev/null +++ b/public/c/libenvironhack/libenvironhack.c @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the 
following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* +LSF 7.0.6 on the mac is missing the unsatisfied exported symbol for environ which was removed on MacOS X 10.5+. +nm $LSF_LIBDIR/liblsf.dylib | grep environ +See "man environ" for more info, along with http://lists.apple.com/archives/java-dev/2007/Dec/msg00096.html +*/ + +#include + +char **environ = (char **)0; + +void init_environ(void) { + environ = (*_NSGetEnviron()); +} diff --git a/public/c/libenvironhack/libenvironhack.dylib b/public/c/libenvironhack/libenvironhack.dylib new file mode 100755 index 000000000..a45e038b4 Binary files /dev/null and b/public/c/libenvironhack/libenvironhack.dylib differ diff --git a/public/java/src/org/broadinstitute/sting/alignment/Aligner.java b/public/java/src/org/broadinstitute/sting/alignment/Aligner.java new file mode 100644 index 000000000..4bf05cb75 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/Aligner.java @@ -0,0 +1,49 @@ +package org.broadinstitute.sting.alignment; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; + +/** + * Create perfect alignments from the read to the genome represented by the given BWT / suffix array. + * + * @author mhanna + * @version 0.1 + */ +public interface Aligner { + /** + * Close this instance of the BWA pointer and delete its resources. 
+ */ + public void close(); + + /** + * Allow the aligner to choose one alignment randomly from the pile of best alignments. + * @param bases Bases to align. + * @return An align + */ + public Alignment getBestAlignment(final byte[] bases); + + /** + * Align the read to the reference. + * @param read Read to align. + * @param header Optional header to drop in place. + * @return A list of the alignments. + */ + public SAMRecord align(final SAMRecord read, final SAMFileHeader header); + + /** + * Get a iterator of alignments, batched by mapping quality. + * @param bases List of bases. + * @return Iterator to alignments. + */ + public Iterable getAllAlignments(final byte[] bases); + + /** + * Get a iterator of aligned reads, batched by mapping quality. + * @param read Read to align. + * @param newHeader Optional new header to use when aligning the read. If present, it must be null. + * @return Iterator to alignments. + */ + public Iterable alignAll(final SAMRecord read, final SAMFileHeader newHeader); +} + + diff --git a/public/java/src/org/broadinstitute/sting/alignment/Alignment.java b/public/java/src/org/broadinstitute/sting/alignment/Alignment.java new file mode 100644 index 000000000..c63f5615f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/Alignment.java @@ -0,0 +1,221 @@ +package org.broadinstitute.sting.alignment; + +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +/** + * Represents an alignment of a read to a site in the reference genome. 
+ * + * @author mhanna + * @version 0.1 + */ +public class Alignment { + protected int contigIndex; + protected long alignmentStart; + protected boolean negativeStrand; + protected int mappingQuality; + + protected char[] cigarOperators; + protected int[] cigarLengths; + + protected int editDistance; + protected String mismatchingPositions; + + protected int numMismatches; + protected int numGapOpens; + protected int numGapExtensions; + protected int bestCount; + protected int secondBestCount; + + /** + * Gets the index of the given contig. + * @return the inde + */ + public int getContigIndex() { return contigIndex; } + + /** + * Gets the starting position for the given alignment. + * @return Starting position. + */ + public long getAlignmentStart() { return alignmentStart; } + + /** + * Is the given alignment on the reverse strand? + * @return True if the alignment is on the reverse strand. + */ + public boolean isNegativeStrand() { return negativeStrand; } + + /** + * Gets the score of this alignment. + * @return The score. + */ + public int getMappingQuality() { return mappingQuality; } + + /** + * Gets the edit distance; will eventually end up in the NM SAM tag + * if this alignment makes it that far. + * @return The edit distance. + */ + public int getEditDistance() { return editDistance; } + + /** + * A string representation of which positions mismatch; contents of MD tag. + * @return String representation of mismatching positions. + */ + public String getMismatchingPositions() { return mismatchingPositions; } + + /** + * Gets the number of mismatches in the read. + * @return Number of mismatches. + */ + public int getNumMismatches() { return numMismatches; } + + /** + * Get the number of gap opens. + * @return Number of gap opens. + */ + public int getNumGapOpens() { return numGapOpens; } + + /** + * Get the number of gap extensions. + * @return Number of gap extensions. 
+ */ + public int getNumGapExtensions() { return numGapExtensions; } + + /** + * Get the number of best alignments. + * @return Number of top scoring alignments. + */ + public int getBestCount() { return bestCount; } + + /** + * Get the number of second best alignments. + * @return Number of second best scoring alignments. + */ + public int getSecondBestCount() { return secondBestCount; } + + /** + * Gets the cigar for this alignment. + * @return sam-jdk formatted alignment. + */ + public Cigar getCigar() { + Cigar cigar = new Cigar(); + for(int i = 0; i < cigarOperators.length; i++) { + CigarOperator operator = CigarOperator.characterToEnum(cigarOperators[i]); + cigar.add(new CigarElement(cigarLengths[i],operator)); + } + return cigar; + } + + /** + * Temporarily implement getCigarString() for debugging; the TextCigarCodec is unfortunately + * package-protected. + * @return + */ + public String getCigarString() { + Cigar cigar = getCigar(); + if(cigar.isEmpty()) return "*"; + + StringBuilder cigarString = new StringBuilder(); + for(CigarElement element: cigar.getCigarElements()) { + cigarString.append(element.getLength()); + cigarString.append(element.getOperator()); + } + return cigarString.toString(); + } + + /** + * Stub for inheritance. + */ + public Alignment() {} + + /** + * Create a new alignment object. + * @param contigIndex The contig to which this read aligned. + * @param alignmentStart The point within the contig to which this read aligned. + * @param negativeStrand Forward or reverse alignment of the given read. + * @param mappingQuality How good does BWA think this mapping is? + * @param cigarOperators The ordered operators in the cigar string. + * @param cigarLengths The lengths to which each operator applies. + * @param editDistance The edit distance (cumulative) of the read. + * @param mismatchingPositions String representation of which bases in the read mismatch. + * @param numMismatches Number of total mismatches in the read. 
+ * @param numGapOpens Number of gap opens in the read. + * @param numGapExtensions Number of gap extensions in the read. + * @param bestCount Number of best alignments in the read. + * @param secondBestCount Number of second best alignments in the read. + */ + public Alignment(int contigIndex, + int alignmentStart, + boolean negativeStrand, + int mappingQuality, + char[] cigarOperators, + int[] cigarLengths, + int editDistance, + String mismatchingPositions, + int numMismatches, + int numGapOpens, + int numGapExtensions, + int bestCount, + int secondBestCount) { + this.contigIndex = contigIndex; + this.alignmentStart = alignmentStart; + this.negativeStrand = negativeStrand; + this.mappingQuality = mappingQuality; + this.cigarOperators = cigarOperators; + this.cigarLengths = cigarLengths; + this.editDistance = editDistance; + this.mismatchingPositions = mismatchingPositions; + this.numMismatches = numMismatches; + this.numGapOpens = numGapOpens; + this.numGapExtensions = numGapExtensions; + this.bestCount = bestCount; + this.secondBestCount = secondBestCount; + } + + /** + * Creates a read directly from an alignment. + * @param alignment The alignment to convert to a read. + * @param unmappedRead Source of the unmapped read. Should have bases, quality scores, and flags. + * @param newSAMHeader The new SAM header to use in creating this read. Can be null, but if so, the sequence + * dictionary in the + * @return A mapped alignment. + */ + public static SAMRecord convertToRead(Alignment alignment, SAMRecord unmappedRead, SAMFileHeader newSAMHeader) { + SAMRecord read; + try { + read = (SAMRecord)unmappedRead.clone(); + } + catch(CloneNotSupportedException ex) { + throw new ReviewedStingException("Unable to create aligned read from template."); + } + + if(newSAMHeader != null) + read.setHeader(newSAMHeader); + + // If we're realigning a previously aligned record, strip out the placement of the alignment. 
+ read.setReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME); + read.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); + read.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME); + read.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START); + + if(alignment != null) { + read.setReadUnmappedFlag(false); + read.setReferenceIndex(alignment.getContigIndex()); + read.setAlignmentStart((int)alignment.getAlignmentStart()); + read.setReadNegativeStrandFlag(alignment.isNegativeStrand()); + read.setMappingQuality(alignment.getMappingQuality()); + read.setCigar(alignment.getCigar()); + if(alignment.isNegativeStrand()) { + read.setReadBases(BaseUtils.simpleReverseComplement(read.getReadBases())); + read.setBaseQualities(Utils.reverse(read.getBaseQualities())); + } + read.setAttribute("NM",alignment.getEditDistance()); + read.setAttribute("MD",alignment.getMismatchingPositions()); + } + + return read; + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java b/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java new file mode 100644 index 000000000..c6755e878 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.alignment; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; +import org.broadinstitute.sting.alignment.bwa.BWTFiles; +import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.Iterator; + +/** + * Validates consistency of the aligner interface by taking reads already aligned by BWA in a BAM file, stripping them + * of their alignment data, realigning them, and making sure one of the best resulting realignments matches the original + * alignment from the input file. + * + * @author mhanna + * @version 0.1 + */ +public class AlignmentValidationWalker extends ReadWalker { + /** + * The supporting BWT index generated using BWT. + */ + @Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false) + private String prefix = null; + + /** + * The instance used to generate alignments. + */ + private BWACAligner aligner = null; + + /** + * Create an aligner object. The aligner object will load and hold the BWT until close() is called. 
+ */ + @Override + public void initialize() { + if(prefix == null) + prefix = getToolkit().getArguments().referenceFile.getAbsolutePath(); + BWTFiles bwtFiles = new BWTFiles(prefix); + BWAConfiguration configuration = new BWAConfiguration(); + aligner = new BWACAligner(bwtFiles,configuration); + } + + /** + * Aligns a read to the given reference. + * @param ref Reference over the read. Read will most likely be unmapped, so ref will be null. + * @param read Read to align. + * @return Number of reads aligned by this map (aka 1). + */ + @Override + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + //logger.info(String.format("examining read %s", read.getReadName())); + + byte[] bases = read.getReadBases(); + if(read.getReadNegativeStrandFlag()) bases = BaseUtils.simpleReverseComplement(bases); + + boolean matches = true; + Iterable alignments = aligner.getAllAlignments(bases); + Iterator alignmentIterator = alignments.iterator(); + + if(!alignmentIterator.hasNext()) { + matches = read.getReadUnmappedFlag(); + } + else { + Alignment[] alignmentsOfBestQuality = alignmentIterator.next(); + for(Alignment alignment: alignmentsOfBestQuality) { + matches = (alignment.getContigIndex() == read.getReferenceIndex()); + matches &= (alignment.getAlignmentStart() == read.getAlignmentStart()); + matches &= (alignment.isNegativeStrand() == read.getReadNegativeStrandFlag()); + matches &= (alignment.getCigar().equals(read.getCigar())); + matches &= (alignment.getMappingQuality() == read.getMappingQuality()); + if(matches) break; + } + } + + if(!matches) { + logger.error("Found mismatch!"); + logger.error(String.format("Read %s:",read.getReadName())); + logger.error(String.format(" Contig index: %d",read.getReferenceIndex())); + logger.error(String.format(" Alignment start: %d", read.getAlignmentStart())); + logger.error(String.format(" Negative strand: %b", read.getReadNegativeStrandFlag())); + logger.error(String.format(" Cigar: %s%n", 
read.getCigarString())); + logger.error(String.format(" Mapping quality: %s%n", read.getMappingQuality())); + for(Alignment[] alignmentsByScore: alignments) { + for(int i = 0; i < alignmentsByScore.length; i++) { + logger.error(String.format("Alignment %d:",i)); + logger.error(String.format(" Contig index: %d",alignmentsByScore[i].getContigIndex())); + logger.error(String.format(" Alignment start: %d", alignmentsByScore[i].getAlignmentStart())); + logger.error(String.format(" Negative strand: %b", alignmentsByScore[i].isNegativeStrand())); + logger.error(String.format(" Cigar: %s", alignmentsByScore[i].getCigarString())); + logger.error(String.format(" Mapping quality: %s%n", alignmentsByScore[i].getMappingQuality())); + } + } + throw new ReviewedStingException(String.format("Read %s mismatches!", read.getReadName())); + } + + return 1; + } + + /** + * Initial value for reduce. In this case, validated reads will be counted. + * @return 0, indicating no reads yet validated. + */ + @Override + public Integer reduceInit() { return 0; } + + /** + * Calculates the number of reads processed. + * @param value Number of reads processed by this map. + * @param sum Number of reads processed before this map. + * @return Number of reads processed up to and including this map. + */ + @Override + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } + + /** + * Cleanup. + * @param result Number of reads processed. 
+ */ + @Override + public void onTraversalDone(Integer result) { + aligner.close(); + super.onTraversalDone(result); + } + +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java b/public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java new file mode 100644 index 000000000..7064e637f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.alignment; + +import net.sf.picard.reference.ReferenceSequenceFileFactory; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; +import org.broadinstitute.sting.alignment.bwa.BWTFiles; +import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.WalkerName; + +import java.io.File; + +/** + * Aligns reads to a given reference using Heng Li's BWA aligner, presenting the resulting alignments in SAM or BAM format. + * Mimics the steps 'bwa aln' followed by 'bwa samse' using the BWA/C implementation. + * + * @author mhanna + * @version 0.1 + */ +@WalkerName("Align") +public class AlignmentWalker extends ReadWalker { + @Argument(fullName="target_reference",shortName="target_ref",doc="The reference to which reads in the source file should be aligned. Alongside this reference should sit index files " + + "generated by bwa index -d bwtsw. If unspecified, will default " + + "to the reference specified via the -R argument.",required=false) + private File targetReferenceFile = null; + + @Output + private StingSAMFileWriter out = null; + + /** + * The actual aligner. + */ + private BWACAligner aligner = null; + + /** + * New header to use, if desired. + */ + private SAMFileHeader header; + + /** + * Create an aligner object. The aligner object will load and hold the BWT until close() is called. 
+ */ + @Override + public void initialize() { + if(targetReferenceFile == null) + targetReferenceFile = getToolkit().getArguments().referenceFile; + BWTFiles bwtFiles = new BWTFiles(targetReferenceFile.getAbsolutePath()); + BWAConfiguration configuration = new BWAConfiguration(); + aligner = new BWACAligner(bwtFiles,configuration); + + // Take the header of the SAM file, tweak it by adding in the reference dictionary and specifying that the target file is unsorted. + header = getToolkit().getSAMFileHeader().clone(); + SAMSequenceDictionary referenceDictionary = + ReferenceSequenceFileFactory.getReferenceSequenceFile(targetReferenceFile).getSequenceDictionary(); + header.setSequenceDictionary(referenceDictionary); + header.setSortOrder(SAMFileHeader.SortOrder.unsorted); + + out.writeHeader(header); + } + + /** + * Aligns a read to the given reference. + * @param ref Reference over the read. Read will most likely be unmapped, so ref will be null. + * @param read Read to align. + * @return Number of alignments found for this read. + */ + @Override + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + SAMRecord alignedRead = aligner.align(read,header); + out.addAlignment(alignedRead); + return 1; + } + + /** + * Initial value for reduce. In this case, alignments will be counted. + * @return 0, indicating no alignments yet found. + */ + @Override + public Integer reduceInit() { return 0; } + + /** + * Calculates the number of alignments found. + * @param value Number of alignments found by this map. + * @param sum Number of alignments found before this map. + * @return Number of alignments found up to and including this map. + */ + @Override + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } + + /** + * Cleanup. + * @param result Number of reads processed. 
+ */ + @Override + public void onTraversalDone(Integer result) { + aligner.close(); + super.onTraversalDone(result); + } + +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/CountBestAlignmentsWalker.java b/public/java/src/org/broadinstitute/sting/alignment/CountBestAlignmentsWalker.java new file mode 100644 index 000000000..57d92319f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/CountBestAlignmentsWalker.java @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.alignment; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; +import org.broadinstitute.sting.alignment.bwa.BWTFiles; +import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; + +import java.io.PrintStream; +import java.util.Iterator; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; + +/** + * Counts the number of best alignments as presented by BWA and outputs a histogram of number of placements vs. the + * frequency of that number of placements. + * + * @author mhanna + * @version 0.1 + */ +public class CountBestAlignmentsWalker extends ReadWalker { + /** + * The supporting BWT index generated using BWT. + */ + @Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false) + private String prefix = null; + + @Output + private PrintStream out = null; + + /** + * The actual aligner. + */ + private Aligner aligner = null; + + private SortedMap alignmentFrequencies = new TreeMap(); + + /** + * Create an aligner object. The aligner object will load and hold the BWT until close() is called. + */ + @Override + public void initialize() { + if(prefix == null) + prefix = getToolkit().getArguments().referenceFile.getAbsolutePath(); + BWTFiles bwtFiles = new BWTFiles(prefix); + BWAConfiguration configuration = new BWAConfiguration(); + aligner = new BWACAligner(bwtFiles,configuration); + } + + /** + * Aligns a read to the given reference. + * @param ref Reference over the read. Read will most likely be unmapped, so ref will be null. + * @param read Read to align. 
+ * @return Number of alignments found for this read. + */ + @Override + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + Iterator alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator(); + if(alignmentIterator.hasNext()) { + int numAlignments = alignmentIterator.next().length; + if(alignmentFrequencies.containsKey(numAlignments)) + alignmentFrequencies.put(numAlignments,alignmentFrequencies.get(numAlignments)+1); + else + alignmentFrequencies.put(numAlignments,1); + } + return 1; + } + + /** + * Initial value for reduce. In this case, validated reads will be counted. + * @return 0, indicating no reads yet validated. + */ + @Override + public Integer reduceInit() { return 0; } + + /** + * Calculates the number of reads processed. + * @param value Number of reads processed by this map. + * @param sum Number of reads processed before this map. + * @return Number of reads processed up to and including this map. + */ + @Override + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } + + /** + * Cleanup. + * @param result Number of reads processed. + */ + @Override + public void onTraversalDone(Integer result) { + aligner.close(); + for(Map.Entry alignmentFrequency: alignmentFrequencies.entrySet()) + out.printf("%d\t%d%n", alignmentFrequency.getKey(), alignmentFrequency.getValue()); + super.onTraversalDone(result); + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java new file mode 100644 index 000000000..ddbf784f5 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java @@ -0,0 +1,38 @@ +package org.broadinstitute.sting.alignment.bwa; + +import org.broadinstitute.sting.alignment.Aligner; + +/** + * Align reads using BWA. 
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public abstract class BWAAligner implements Aligner {
+    /**
+     * The supporting files used by BWA, stored exactly as passed to the constructor.
+     */
+    protected BWTFiles bwtFiles;
+
+    /**
+     * The current configuration for the BWA aligner, stored exactly as passed to the constructor.
+     */
+    protected BWAConfiguration configuration;
+
+    /**
+     * Create a new BWAAligner.  Purpose of this call is to ensure that all BWA constructors accept the correct
+     * parameters; it only stores the two arguments and performs no validation.
+     * @param bwtFiles The many files representing BWTs persisted to disk.
+     * @param configuration Configuration parameters for the alignment.
+     */
+    public BWAAligner(BWTFiles bwtFiles, BWAConfiguration configuration) {
+        this.bwtFiles = bwtFiles;
+        this.configuration = configuration;
+    }
+
+    /**
+     * Update the configuration passed to the BWA aligner.
+     * @param configuration New configuration to set.
+     */
+    public abstract void updateConfiguration(BWAConfiguration configuration);
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java
new file mode 100644
index 000000000..73441cb6a
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java
@@ -0,0 +1,44 @@
+package org.broadinstitute.sting.alignment.bwa;
+
+/**
+ * Configuration for the BWA/C aligner.  Every setting is a public boxed value that starts out null.
+ * NOTE(review): null presumably means "keep the aligner's own default" -- confirm against the code that
+ * consumes this object.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class BWAConfiguration {
+    /**
+     * The maximum edit distance used by BWA.
+     */
+    public Float maximumEditDistance = null;
+
+    /**
+     * How many gap opens are acceptable within this alignment?
+     */
+    public Integer maximumGapOpens = null;
+
+    /**
+     * How many gap extensions are acceptable within this alignment?
+     */
+    public Integer maximumGapExtensions = null;
+
+    /**
+     * Do we disallow indels within a certain range from the start / end?
+     */
+    public Integer disallowIndelWithinRange = null;
+
+    /**
+     * What is the scoring penalty for a mismatch?
+ */ + public Integer mismatchPenalty = null; + + /** + * What is the scoring penalty for a gap open? + */ + public Integer gapOpenPenalty = null; + + /** + * What is the scoring penalty for a gap extension? + */ + public Integer gapExtensionPenalty = null; +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java new file mode 100644 index 000000000..a0589ac84 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java @@ -0,0 +1,234 @@ +package org.broadinstitute.sting.alignment.bwa; + +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.util.StringUtil; +import org.broadinstitute.sting.alignment.reference.bwt.*; +import org.broadinstitute.sting.alignment.reference.packing.PackUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.File; +import java.io.IOException; + +/** + * Support files for BWT. + * + * @author mhanna + * @version 0.1 + */ +public class BWTFiles { + /** + * ANN (?) file name. + */ + public final File annFile; + + /** + * AMB (?) file name. + */ + public final File ambFile; + + /** + * Packed reference sequence file. + */ + public final File pacFile; + + /** + * Reverse of packed reference sequence file. + */ + public final File rpacFile; + + /** + * Forward BWT file. + */ + public final File forwardBWTFile; + + /** + * Forward suffix array file. + */ + public final File forwardSAFile; + + /** + * Reverse BWT file. + */ + public final File reverseBWTFile; + + /** + * Reverse suffix array file. + */ + public final File reverseSAFile; + + /** + * Where these files autogenerated on the fly? + */ + public final boolean autogenerated; + + /** + * Create a new BWA configuration file using the given prefix. + * @param prefix Prefix to use when creating the configuration. 
Must not be null. + */ + public BWTFiles(String prefix) { + if(prefix == null) + throw new ReviewedStingException("Prefix must not be null."); + annFile = new File(prefix + ".ann"); + ambFile = new File(prefix + ".amb"); + pacFile = new File(prefix + ".pac"); + rpacFile = new File(prefix + ".rpac"); + forwardBWTFile = new File(prefix + ".bwt"); + forwardSAFile = new File(prefix + ".sa"); + reverseBWTFile = new File(prefix + ".rbwt"); + reverseSAFile = new File(prefix + ".rsa"); + autogenerated = false; + } + + /** + * Hand-create a new BWTFiles object, specifying a unique file object for each type. + * @param annFile ANN (alternate dictionary) file. + * @param ambFile AMB (holes) files. + * @param pacFile Packed representation of the forward reference sequence. + * @param forwardBWTFile BWT representation of the forward reference sequence. + * @param forwardSAFile SA representation of the forward reference sequence. + * @param rpacFile Packed representation of the reversed reference sequence. + * @param reverseBWTFile BWT representation of the reversed reference sequence. + * @param reverseSAFile SA representation of the reversed reference sequence. + */ + private BWTFiles(File annFile, + File ambFile, + File pacFile, + File forwardBWTFile, + File forwardSAFile, + File rpacFile, + File reverseBWTFile, + File reverseSAFile) { + this.annFile = annFile; + this.ambFile = ambFile; + this.pacFile = pacFile; + this.forwardBWTFile = forwardBWTFile; + this.forwardSAFile = forwardSAFile; + this.rpacFile = rpacFile; + this.reverseBWTFile = reverseBWTFile; + this.reverseSAFile = reverseSAFile; + autogenerated = true; + } + + /** + * Close out this files object, in the process deleting any temporary filse + * that were created. 
+ */ + public void close() { + if(autogenerated) { + boolean success = true; + success = annFile.delete(); + success &= ambFile.delete(); + success &= pacFile.delete(); + success &= forwardBWTFile.delete(); + success &= forwardSAFile.delete(); + success &= rpacFile.delete(); + success &= reverseBWTFile.delete(); + success &= reverseSAFile.delete(); + + if(!success) + throw new ReviewedStingException("Unable to clean up autogenerated representation"); + } + } + + /** + * Create a new set of BWT files from the given reference sequence. + * @param referenceSequence Sequence from which to build metadata. + * @return A new object representing encoded representations of each sequence. + */ + public static BWTFiles createFromReferenceSequence(byte[] referenceSequence) { + byte[] normalizedReferenceSequence = new byte[referenceSequence.length]; + System.arraycopy(referenceSequence,0,normalizedReferenceSequence,0,referenceSequence.length); + normalizeReferenceSequence(normalizedReferenceSequence); + + File annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile; + try { + // Write the ann and amb for this reference sequence. + annFile = File.createTempFile("bwt",".ann"); + ambFile = File.createTempFile("bwt",".amb"); + + SAMSequenceDictionary dictionary = new SAMSequenceDictionary(); + dictionary.addSequence(new SAMSequenceRecord("autogenerated",normalizedReferenceSequence.length)); + + ANNWriter annWriter = new ANNWriter(annFile); + annWriter.write(dictionary); + annWriter.close(); + + AMBWriter ambWriter = new AMBWriter(ambFile); + ambWriter.writeEmpty(dictionary); + ambWriter.close(); + + // Write the encoded files for the forward version of this reference sequence. 
+ pacFile = File.createTempFile("bwt",".pac"); + bwtFile = File.createTempFile("bwt",".bwt"); + saFile = File.createTempFile("bwt",".sa"); + + writeEncodedReferenceSequence(normalizedReferenceSequence,pacFile,bwtFile,saFile); + + // Write the encoded files for the reverse version of this reference sequence. + byte[] reverseReferenceSequence = Utils.reverse(normalizedReferenceSequence); + + rpacFile = File.createTempFile("bwt",".rpac"); + rbwtFile = File.createTempFile("bwt",".rbwt"); + rsaFile = File.createTempFile("bwt",".rsa"); + + writeEncodedReferenceSequence(reverseReferenceSequence,rpacFile,rbwtFile,rsaFile); + } + catch(IOException ex) { + throw new ReviewedStingException("Unable to write autogenerated reference sequence to temporary files"); + } + + // Make sure that, at the very least, all temporary files are deleted on exit. + annFile.deleteOnExit(); + ambFile.deleteOnExit(); + pacFile.deleteOnExit(); + bwtFile.deleteOnExit(); + saFile.deleteOnExit(); + rpacFile.deleteOnExit(); + rbwtFile.deleteOnExit(); + rsaFile.deleteOnExit(); + + return new BWTFiles(annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile); + } + + /** + * Write the encoded form of the reference sequence. In the case of BWA, the encoded reference + * sequence is the reference itself in PAC format, the BWT, and the suffix array. + * @param referenceSequence The reference sequence to encode. + * @param pacFile Target for the PAC-encoded reference. + * @param bwtFile Target for the BWT representation of the reference. + * @param suffixArrayFile Target for the suffix array encoding of the reference. + * @throws java.io.IOException In case of issues writing to the file. 
+ */ + private static void writeEncodedReferenceSequence(byte[] referenceSequence, + File pacFile, + File bwtFile, + File suffixArrayFile) throws IOException { + PackUtils.writeReferenceSequence(pacFile,referenceSequence); + + BWT bwt = BWT.createFromReferenceSequence(referenceSequence); + BWTWriter bwtWriter = new BWTWriter(bwtFile); + bwtWriter.write(bwt); + bwtWriter.close(); + + SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence); + SuffixArrayWriter suffixArrayWriter = new SuffixArrayWriter(suffixArrayFile); + suffixArrayWriter.write(suffixArray); + suffixArrayWriter.close(); + } + + /** + * Convert the given reference sequence into a form suitable for building into + * on-the-fly sequences. + * @param referenceSequence The reference sequence to normalize. + * @throws org.broadinstitute.sting.utils.exceptions.ReviewedStingException if normalized sequence cannot be generated. + */ + private static void normalizeReferenceSequence(byte[] referenceSequence) { + StringUtil.toUpperCase(referenceSequence); + for(byte base: referenceSequence) { + if(base != 'A' && base != 'C' && base != 'G' && base != 'T') + throw new ReviewedStingException(String.format("Base type %c is not supported when building references on-the-fly",(char)base)); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java new file mode 100644 index 000000000..165314259 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java @@ -0,0 +1,259 @@ +package org.broadinstitute.sting.alignment.bwa.c; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.alignment.Alignment; +import org.broadinstitute.sting.alignment.bwa.BWAAligner; +import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; +import org.broadinstitute.sting.alignment.bwa.BWTFiles; +import 
org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.Arrays; +import java.util.Iterator; + +/** + * An aligner using the BWA/C implementation. + * + * @author mhanna + * @version 0.1 + */ +public class BWACAligner extends BWAAligner { + static { + System.loadLibrary("bwa"); + } + + /** + * A pointer to the C++ object representing the BWA engine. + */ + private long thunkPointer = 0; + + public BWACAligner(BWTFiles bwtFiles, BWAConfiguration configuration) { + super(bwtFiles,configuration); + if(thunkPointer != 0) + throw new ReviewedStingException("BWA/C attempting to reinitialize."); + + if(!bwtFiles.annFile.exists()) throw new ReviewedStingException("ANN file is missing; please rerun 'bwa aln' to regenerate it."); + if(!bwtFiles.ambFile.exists()) throw new ReviewedStingException("AMB file is missing; please rerun 'bwa aln' to regenerate it."); + if(!bwtFiles.pacFile.exists()) throw new ReviewedStingException("PAC file is missing; please rerun 'bwa aln' to regenerate it."); + if(!bwtFiles.forwardBWTFile.exists()) throw new ReviewedStingException("Forward BWT file is missing; please rerun 'bwa aln' to regenerate it."); + if(!bwtFiles.forwardSAFile.exists()) throw new ReviewedStingException("Forward SA file is missing; please rerun 'bwa aln' to regenerate it."); + if(!bwtFiles.reverseBWTFile.exists()) throw new ReviewedStingException("Reverse BWT file is missing; please rerun 'bwa aln' to regenerate it."); + if(!bwtFiles.reverseSAFile.exists()) throw new ReviewedStingException("Reverse SA file is missing; please rerun 'bwa aln' to regenerate it."); + + thunkPointer = create(bwtFiles,configuration); + } + + /** + * Create an aligner object using an array of bytes as a reference. + * @param referenceSequence Reference sequence to encode ad-hoc. + * @param configuration Configuration for the given aligner. 
+ */ + public BWACAligner(byte[] referenceSequence, BWAConfiguration configuration) { + this(BWTFiles.createFromReferenceSequence(referenceSequence),configuration); + // Now that the temporary files are created, the temporary files can be destroyed. + bwtFiles.close(); + } + + /** + * Update the configuration passed to the BWA aligner. + * @param configuration New configuration to set. + */ + @Override + public void updateConfiguration(BWAConfiguration configuration) { + if(thunkPointer == 0) + throw new ReviewedStingException("BWA/C: attempting to update configuration of uninitialized aligner."); + updateConfiguration(thunkPointer,configuration); + } + + /** + * Close this instance of the BWA pointer and delete its resources. + */ + @Override + public void close() { + if(thunkPointer == 0) + throw new ReviewedStingException("BWA/C close attempted, but BWA/C is not properly initialized."); + destroy(thunkPointer); + } + + /** + * Allow the aligner to choose one alignment randomly from the pile of best alignments. + * @param bases Bases to align. + * @return An align + */ + @Override + public Alignment getBestAlignment(final byte[] bases) { + if(thunkPointer == 0) + throw new ReviewedStingException("BWA/C getBestAlignment attempted, but BWA/C is not properly initialized."); + return getBestAlignment(thunkPointer,bases); + } + + /** + * Get the best aligned read, chosen randomly from the pile of best alignments. + * @param read Read to align. + * @param newHeader New header to apply to this SAM file. Can be null, but if so, read header must be valid. + * @return Read with injected alignment data. 
+ */ + @Override + public SAMRecord align(final SAMRecord read, final SAMFileHeader newHeader) { + if(bwtFiles.autogenerated) + throw new UnsupportedOperationException("Cannot create target alignment; source contig was generated ad-hoc and is not reliable"); + return Alignment.convertToRead(getBestAlignment(read.getReadBases()),read,newHeader); + } + + /** + * Get a iterator of alignments, batched by mapping quality. + * @param bases List of bases. + * @return Iterator to alignments. + */ + @Override + public Iterable getAllAlignments(final byte[] bases) { + final BWAPath[] paths = getPaths(bases); + return new Iterable() { + public Iterator iterator() { + return new Iterator() { + /** + * The last position accessed. + */ + private int position = 0; + + /** + * Whether all alignments have been seen based on the current position. + * @return True if any more alignments are pending. False otherwise. + */ + public boolean hasNext() { return position < paths.length; } + + /** + * Return the next cross-section of alignments, based on mapping quality. + * @return Array of the next set of alignments of a given mapping quality. + */ + public Alignment[] next() { + if(position >= paths.length) + throw new UnsupportedOperationException("Out of alignments to return."); + int score = paths[position].score; + int startingPosition = position; + while(position < paths.length && paths[position].score == score) position++; + return convertPathsToAlignments(bases,Arrays.copyOfRange(paths,startingPosition,position)); + } + + /** + * Unsupported. + */ + public void remove() { throw new UnsupportedOperationException("Cannot remove from an alignment iterator"); } + }; + } + }; + } + + /** + * Get a iterator of aligned reads, batched by mapping quality. + * @param read Read to align. + * @param newHeader Optional new header to use when aligning the read. If present, it must be null. + * @return Iterator to alignments. 
+ */ + @Override + public Iterable alignAll(final SAMRecord read, final SAMFileHeader newHeader) { + if(bwtFiles.autogenerated) + throw new UnsupportedOperationException("Cannot create target alignment; source contig was generated ad-hoc and is not reliable"); + final Iterable alignments = getAllAlignments(read.getReadBases()); + return new Iterable() { + public Iterator iterator() { + final Iterator alignmentIterator = alignments.iterator(); + return new Iterator() { + /** + * Whether all alignments have been seen based on the current position. + * @return True if any more alignments are pending. False otherwise. + */ + public boolean hasNext() { return alignmentIterator.hasNext(); } + + /** + * Return the next cross-section of alignments, based on mapping quality. + * @return Array of the next set of alignments of a given mapping quality. + */ + public SAMRecord[] next() { + Alignment[] alignmentsOfQuality = alignmentIterator.next(); + SAMRecord[] reads = new SAMRecord[alignmentsOfQuality.length]; + for(int i = 0; i < alignmentsOfQuality.length; i++) { + reads[i] = Alignment.convertToRead(alignmentsOfQuality[i],read,newHeader); + } + return reads; + } + + /** + * Unsupported. + */ + public void remove() { throw new UnsupportedOperationException("Cannot remove from an alignment iterator"); } + }; + } + }; + } + + /** + * Get the paths associated with the given base string. + * @param bases List of bases. + * @return A set of paths through the BWA. + */ + public BWAPath[] getPaths(byte[] bases) { + if(thunkPointer == 0) + throw new ReviewedStingException("BWA/C getPaths attempted, but BWA/C is not properly initialized."); + return getPaths(thunkPointer,bases); + } + + /** + * Create a pointer to the BWA/C thunk. + * @param files BWT source files. + * @param configuration Configuration of the aligner. + * @return Pointer to the BWA/C thunk. 
+ */ + protected native long create(BWTFiles files, BWAConfiguration configuration); + + /** + * Update the configuration passed to the BWA aligner. For internal use only. + * @param thunkPointer pointer to BWA object. + * @param configuration New configuration to set. + */ + protected native void updateConfiguration(long thunkPointer, BWAConfiguration configuration); + + /** + * Destroy the BWA/C thunk. + * @param thunkPointer Pointer to the allocated thunk. + */ + protected native void destroy(long thunkPointer); + + /** + * Do the extra steps involved in converting a local alignment to a global alignment. + * @param bases ASCII representation of byte array. + * @param paths Paths through the current BWT. + * @return A list of alignments. + */ + protected Alignment[] convertPathsToAlignments(byte[] bases, BWAPath[] paths) { + if(thunkPointer == 0) + throw new ReviewedStingException("BWA/C convertPathsToAlignments attempted, but BWA/C is not properly initialized."); + return convertPathsToAlignments(thunkPointer,bases,paths); + } + + /** + * Caller to the path generation functionality within BWA/C. Call this method's getPaths() wrapper (above) instead. + * @param thunkPointer pointer to the C++ object managing BWA/C. + * @param bases ASCII representation of byte array. + * @return A list of paths through the specified BWT. + */ + protected native BWAPath[] getPaths(long thunkPointer, byte[] bases); + + /** + * Do the extra steps involved in converting a local alignment to a global alignment. + * Call this method's convertPathsToAlignments() wrapper (above) instead. + * @param thunkPointer pointer to the C++ object managing BWA/C. + * @param bases ASCII representation of byte array. + * @param paths Paths through the current BWT. + * @return A list of alignments. 
+ */ + protected native Alignment[] convertPathsToAlignments(long thunkPointer, byte[] bases, BWAPath[] paths); + + /** + * Gets the best alignment from BWA/C, randomly selected from all best-aligned reads. + * @param thunkPointer Pointer to BWA thunk. + * @param bases bases to align. + * @return The best alignment from BWA/C. + */ + protected native Alignment getBestAlignment(long thunkPointer, byte[] bases); +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java new file mode 100755 index 000000000..347d4344f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.alignment.bwa.c; + +/** + * Models a BWA path. 
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class BWAPath {
+    /**
+     * Number of mismatches encountered along this path.
+     */
+    public final int numMismatches;
+
+    /**
+     * Number of gap opens encountered along this path.
+     */
+    public final int numGapOpens;
+
+    /**
+     * Number of gap extensions along this path.
+     */
+    public final int numGapExtensions;
+
+    /**
+     * Whether this alignment was found on the positive or negative strand.
+     */
+    public final boolean negativeStrand;
+
+    /**
+     * Starting coordinate in the BWT.
+     */
+    public final long k;
+
+    /**
+     * Ending coordinate in the BWT.
+     */
+    public final long l;
+
+    /**
+     * The score of this path.
+     */
+    public final int score;
+
+    /**
+     * The number of best alignments seen along this path.
+     */
+    public final int bestCount;
+
+    /**
+     * The number of second best alignments seen along this path.
+     */
+    public final int secondBestCount;
+
+    /**
+     * Create a new path with the given attributes.  Each argument is stored unchanged in the field of the same name.
+     * @param numMismatches Number of mismatches along path.
+     * @param numGapOpens Number of gap opens along path.
+     * @param numGapExtensions Number of gap extensions along path.
+     * @param negativeStrand Strand on which this alignment was found.
+     * @param k Index to first coordinate within BWT.
+     * @param l Index to last coordinate within BWT.
+     * @param score Score of this alignment.  Not the mapping quality.
+     * @param bestCount Number of best alignments seen along this path.
+     * @param secondBestCount Number of second best alignments seen along this path.
+     */
+    public BWAPath(int numMismatches, int numGapOpens, int numGapExtensions, boolean negativeStrand, long k, long l, int score, int bestCount, int secondBestCount) {
+        this.numMismatches = numMismatches;
+        this.numGapOpens = numGapOpens;
+        this.numGapExtensions = numGapExtensions;
+        this.negativeStrand = negativeStrand;
+        this.k = k;
+        this.l = l;
+        this.score = score;
+        this.bestCount = bestCount;
+        this.secondBestCount = secondBestCount;
+    }
+
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java
new file mode 100644
index 000000000..2d568a96a
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java
@@ -0,0 +1,164 @@
+package org.broadinstitute.sting.alignment.bwa.java;
+
+import net.sf.picard.reference.IndexedFastaSequenceFile;
+import net.sf.samtools.*;
+import org.broadinstitute.sting.alignment.Aligner;
+import org.broadinstitute.sting.alignment.Alignment;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+
+/**
+ * A test harness to ensure that the perfect aligner works.
+ * + * @author mhanna + * @version 0.1 + */ +public class AlignerTestHarness { + public static void main( String argv[] ) throws FileNotFoundException { + if( argv.length != 6 ) { + System.out.println("PerfectAlignerTestHarness "); + System.exit(1); + } + + File referenceFile = new File(argv[0]); + File bwtFile = new File(argv[1]); + File rbwtFile = new File(argv[2]); + File suffixArrayFile = new File(argv[3]); + File reverseSuffixArrayFile = new File(argv[4]); + File bamFile = new File(argv[5]); + + align(referenceFile,bwtFile,rbwtFile,suffixArrayFile,reverseSuffixArrayFile,bamFile); + } + + private static void align(File referenceFile, File bwtFile, File rbwtFile, File suffixArrayFile, File reverseSuffixArrayFile, File bamFile) throws FileNotFoundException { + Aligner aligner = new BWAJavaAligner(bwtFile,rbwtFile,suffixArrayFile,reverseSuffixArrayFile); + int count = 0; + + SAMFileReader reader = new SAMFileReader(bamFile); + reader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); + + int mismatches = 0; + int failures = 0; + + for(SAMRecord read: reader) { + count++; + if( count > 200000 ) break; + //if( count < 366000 ) continue; + //if( count > 2 ) break; + //if( !read.getReadName().endsWith("SL-XBC:1:82:506:404#0") ) + // continue; + //if( !read.getReadName().endsWith("SL-XBC:1:36:30:1926#0") ) + // continue; + //if( !read.getReadName().endsWith("SL-XBC:1:60:1342:1340#0") ) + // continue; + + SAMRecord alignmentCleaned = null; + try { + alignmentCleaned = (SAMRecord)read.clone(); + } + catch( CloneNotSupportedException ex ) { + throw new ReviewedStingException("SAMRecord clone not supported", ex); + } + + if( alignmentCleaned.getReadNegativeStrandFlag() ) + alignmentCleaned.setReadBases(BaseUtils.simpleReverseComplement(alignmentCleaned.getReadBases())); + + alignmentCleaned.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); + alignmentCleaned.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); + 
alignmentCleaned.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY); + alignmentCleaned.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR); + + // Clear everything except flags pertaining to pairing and set 'unmapped' status to true. + alignmentCleaned.setFlags(alignmentCleaned.getFlags() & 0x00A1 | 0x000C); + + Iterable alignments = aligner.getAllAlignments(alignmentCleaned.getReadBases()); + if(!alignments.iterator().hasNext() ) { + //throw new StingException(String.format("Unable to align read %s to reference; count = %d",read.getReadName(),count)); + System.out.printf("Unable to align read %s to reference; count = %d%n",read.getReadName(),count); + failures++; + } + + Alignment foundAlignment = null; + for(Alignment[] alignmentsOfQuality: alignments) { + for(Alignment alignment: alignmentsOfQuality) { + if( read.getReadNegativeStrandFlag() != alignment.isNegativeStrand() ) + continue; + if( read.getAlignmentStart() != alignment.getAlignmentStart() ) + continue; + + foundAlignment = alignment; + } + } + + if( foundAlignment != null ) { + //System.out.printf("%s: Aligned read to reference at position %d with %d mismatches, %d gap opens, and %d gap extensions.%n", read.getReadName(), foundAlignment.getAlignmentStart(), foundAlignment.getMismatches(), foundAlignment.getGapOpens(), foundAlignment.getGapExtensions()); + } + else { + System.out.printf("Error aligning read %s%n", read.getReadName()); + + mismatches++; + + IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile); + + System.out.printf("read = %s, position = %d, negative strand = %b%n", formatBasesBasedOnCigar(read.getReadString(),read.getCigar(),CigarOperator.DELETION), + read.getAlignmentStart(), + read.getReadNegativeStrandFlag()); + int numDeletions = numDeletionsInCigar(read.getCigar()); + String expectedRef = new 
String(reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),read.getAlignmentStart(),read.getAlignmentStart()+read.getReadLength()+numDeletions-1).getBases()); + System.out.printf("expected ref = %s%n", formatBasesBasedOnCigar(expectedRef,read.getCigar(),CigarOperator.INSERTION)); + + for(Alignment[] alignmentsOfQuality: alignments) { + for(Alignment alignment: alignmentsOfQuality) { + System.out.println(); + + Cigar cigar = ((BWAAlignment)alignment).getCigar(); + + System.out.printf("read = %s%n", formatBasesBasedOnCigar(read.getReadString(),cigar,CigarOperator.DELETION)); + + int deletionCount = ((BWAAlignment)alignment).getNumberOfBasesMatchingState(AlignmentState.DELETION); + String alignedRef = new String(reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),alignment.getAlignmentStart(),alignment.getAlignmentStart()+read.getReadLength()+deletionCount-1).getBases()); + System.out.printf("actual ref = %s, position = %d, negative strand = %b%n", formatBasesBasedOnCigar(alignedRef,cigar,CigarOperator.INSERTION), + alignment.getAlignmentStart(), + alignment.isNegativeStrand()); + } + } + + //throw new StingException(String.format("Read %s was placed at incorrect location; count = %d%n",read.getReadName(),count)); + } + + + if( count % 1000 == 0 ) + System.out.printf("%d reads examined.%n",count); + } + + System.out.printf("%d reads examined; %d mismatches; %d failures.%n",count,mismatches,failures); + } + + private static String formatBasesBasedOnCigar( String bases, Cigar cigar, CigarOperator toBlank ) { + StringBuilder formatted = new StringBuilder(); + int readIndex = 0; + for(CigarElement cigarElement: cigar.getCigarElements()) { + if(cigarElement.getOperator() == toBlank) { + int number = cigarElement.getLength(); + while( number-- > 0 ) formatted.append(' '); + } + else { + int number = cigarElement.getLength(); + while( number-- > 0 ) 
formatted.append(bases.charAt(readIndex++)); + } + } + return formatted.toString(); + } + + private static int numDeletionsInCigar( Cigar cigar ) { + int numDeletions = 0; + for(CigarElement cigarElement: cigar.getCigarElements()) { + if(cigarElement.getOperator() == CigarOperator.DELETION) + numDeletions += cigarElement.getLength(); + } + return numDeletions; + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java new file mode 100644 index 000000000..f1e3c31b6 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java @@ -0,0 +1,150 @@ +package org.broadinstitute.sting.alignment.bwa.java; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.Iterator; + +/** + * Represents a sequence of matches. + * + * @author mhanna + * @version 0.1 + */ +public class AlignmentMatchSequence implements Cloneable { + /** + * Stores the particular match entries in the order they occur. + */ + private Deque entries = new ArrayDeque(); + + /** + * Clone the given match sequence. + * @return A deep copy of the current match sequence. + */ + public AlignmentMatchSequence clone() { + AlignmentMatchSequence copy = null; + try { + copy = (AlignmentMatchSequence)super.clone(); + } + catch( CloneNotSupportedException ex ) { + throw new ReviewedStingException("Unable to clone AlignmentMatchSequence."); + } + + copy.entries = new ArrayDeque(); + for( AlignmentMatchSequenceEntry entry: entries ) + copy.entries.add(entry.clone()); + + return copy; + } + + public Cigar convertToCigar(boolean negativeStrand) { + Cigar cigar = new Cigar(); + Iterator iterator = negativeStrand ? 
entries.descendingIterator() : entries.iterator(); + while( iterator.hasNext() ) { + AlignmentMatchSequenceEntry entry = iterator.next(); + CigarOperator operator; + switch( entry.getAlignmentState() ) { + case MATCH_MISMATCH: operator = CigarOperator.MATCH_OR_MISMATCH; break; + case INSERTION: operator = CigarOperator.INSERTION; break; + case DELETION: operator = CigarOperator.DELETION; break; + default: throw new ReviewedStingException("convertToCigar: cannot process state: " + entry.getAlignmentState()); + } + cigar.add( new CigarElement(entry.count,operator) ); + } + return cigar; + } + + /** + * Add a new alignment of the given state, extending the last entry when it already has that state. + * @param state State to add to the sequence. + */ + public void addNext( AlignmentState state ) { + AlignmentMatchSequenceEntry last = entries.peekLast(); + // If the last entry is the same as this one, increment it. Otherwise, add a new entry. + if( last != null && last.alignmentState == state ) + last.increment(); + else + entries.add(new AlignmentMatchSequenceEntry(state)); + } + + /** + * Gets the current state of this alignment (what's the state of the last base?) + * @return State of the most recently aligned base; MATCH_MISMATCH if no base has been added yet. + */ + public AlignmentState getCurrentState() { + if( entries.size() == 0 ) + return AlignmentState.MATCH_MISMATCH; + return entries.peekLast().getAlignmentState(); + } + + /** + * How many bases in the read match the given state. + * @param state State to test. + * @return number of bases which match that state, summed over all entries having that state. + */ + public int getNumberOfBasesMatchingState(AlignmentState state) { + int matches = 0; + for( AlignmentMatchSequenceEntry entry: entries ) { + if( entry.getAlignmentState() == state ) + matches += entry.count; + } + return matches; + } + + /** + * Stores an individual match sequence entry. + */ + private class AlignmentMatchSequenceEntry implements Cloneable { + /** + * The state of the alignment throughout a given point in the sequence. 
+ */ + private final AlignmentState alignmentState; + + /** + * The number of bases having this particular state. + */ + private int count; + + /** + * Create a new sequence entry with the given state. + * @param alignmentState The state that this sequence should contain. + */ + AlignmentMatchSequenceEntry( AlignmentState alignmentState ) { + this.alignmentState = alignmentState; + this.count = 1; + } + + /** + * Clone the given match sequence entry. + * @return A deep copy of the current match sequence entry. + */ + public AlignmentMatchSequenceEntry clone() { + try { + return (AlignmentMatchSequenceEntry)super.clone(); + } + catch( CloneNotSupportedException ex ) { + throw new ReviewedStingException("Unable to clone AlignmentMatchSequenceEntry."); + } + } + + /** + * Retrieves the current state of the alignment. + * @return The state of the current sequence. + */ + AlignmentState getAlignmentState() { + return alignmentState; + } + + /** + * Increment the count of alignments having this particular state. + */ + void increment() { + count++; + } + } +} + diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java new file mode 100644 index 000000000..92c603335 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java @@ -0,0 +1,13 @@ +package org.broadinstitute.sting.alignment.bwa.java; + +/** + * The state of a given base in the alignment. 
+ * + * @author mhanna + * @version 0.1 + */ +public enum AlignmentState { + MATCH_MISMATCH, + INSERTION, + DELETION +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java new file mode 100644 index 000000000..f3b515dba --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java @@ -0,0 +1,190 @@ +package org.broadinstitute.sting.alignment.bwa.java; + +import net.sf.samtools.Cigar; +import org.broadinstitute.sting.alignment.Alignment; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +/** + * An alignment object to be used incrementally as the BWA aligner + * inspects the read. + * + * @author mhanna + * @version 0.1 + */ +public class BWAAlignment extends Alignment implements Cloneable { + /** + * Track the number of alignments that have been created. + */ + private static long numCreated; + + /** + * Which number alignment is this? + */ + private long creationNumber; + + /** + * The aligner performing the alignments. + */ + protected BWAJavaAligner aligner; + + /** + * The sequence of matches/mismatches/insertions/deletions. + */ + private AlignmentMatchSequence alignmentMatchSequence = new AlignmentMatchSequence(); + + /** + * Working variable. How many bases have been matched at this point. + */ + protected int position; + + /** + * Working variable. How many mismatches have been encountered at this point. + */ + private int mismatches; + + /** + * Number of gap opens in alignment. + */ + private int gapOpens; + + /** + * Number of gap extensions in alignment. + */ + private int gapExtensions; + + /** + * Working variable. The lower bound of the alignment within the BWT. + */ + protected long loBound; + + /** + * Working variable. The upper bound of the alignment within the BWT. 
+ */ + protected long hiBound; + + protected void setAlignmentStart(long position) { + this.alignmentStart = position; + } + + protected void setNegativeStrand(boolean negativeStrand) { + this.negativeStrand = negativeStrand; + } + + /** + * Cache the score. + */ + private int score; + + public Cigar getCigar() { + return alignmentMatchSequence.convertToCigar(isNegativeStrand()); + } + + /** + * Gets the current state of this alignment (state of the last base viewed).. + * @return Current state of the alignment. + */ + public AlignmentState getCurrentState() { + return alignmentMatchSequence.getCurrentState(); + } + + /** + * Adds the given state to the current alignment. + * @param state State to add to the given alignment. + */ + public void addState( AlignmentState state ) { + alignmentMatchSequence.addNext(state); + } + + /** + * Gets the BWA score of this alignment. + * @return BWA-style scores. 0 is best. + */ + public int getScore() { + return score; + } + + public int getMismatches() { return mismatches; } + public int getGapOpens() { return gapOpens; } + public int getGapExtensions() { return gapExtensions; } + + public void incrementMismatches() { + this.mismatches++; + updateScore(); + } + + public void incrementGapOpens() { + this.gapOpens++; + updateScore(); + } + + public void incrementGapExtensions() { + this.gapExtensions++; + updateScore(); + } + + /** + * Updates the score based on new information about matches / mismatches. + */ + private void updateScore() { + score = mismatches*aligner.MISMATCH_PENALTY + gapOpens*aligner.GAP_OPEN_PENALTY + gapExtensions*aligner.GAP_EXTENSION_PENALTY; + } + + /** + * Create a new alignment with the given parent aligner. + * @param aligner Aligner being used. + */ + public BWAAlignment( BWAJavaAligner aligner ) { + this.aligner = aligner; + this.creationNumber = numCreated++; + } + + /** + * Clone the alignment. + * @return New instance of the alignment. 
+ */ + public BWAAlignment clone() { + BWAAlignment newAlignment = null; + try { + newAlignment = (BWAAlignment)super.clone(); + } + catch( CloneNotSupportedException ex ) { + throw new ReviewedStingException("Unable to clone BWAAlignment."); + } + newAlignment.creationNumber = numCreated++; + newAlignment.alignmentMatchSequence = alignmentMatchSequence.clone(); + + return newAlignment; + } + + /** + * How many bases in the read match the given state. + * @param state State to test. + * @return number of bases which match that state. + */ + public int getNumberOfBasesMatchingState(AlignmentState state) { + return alignmentMatchSequence.getNumberOfBasesMatchingState(state); + } + + /** + * Compare this alignment to another alignment. + * @param rhs Other alignment to which to compare. + * @return < 0 if this < other, == 0 if this == other, > 0 if this > other + */ + public int compareTo(Alignment rhs) { + BWAAlignment other = (BWAAlignment)rhs; + + // If the scores are different, disambiguate using the score. + if(score != other.score) + return score > other.score ? 1 : -1; + + // Otherwise, use the order in which the elements were created. + if(creationNumber != other.creationNumber) + return creationNumber > other.creationNumber ? 
-1 : 1; + + return 0; + } + + public String toString() { + return String.format("position: %d, strand: %b, state: %s, mismatches: %d, gap opens: %d, gap extensions: %d, loBound: %d, hiBound: %d, score: %d, creationNumber: %d", position, negativeStrand, alignmentMatchSequence.getCurrentState(), mismatches, gapOpens, gapExtensions, loBound, hiBound, getScore(), creationNumber); + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java new file mode 100644 index 000000000..fbeac9192 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java @@ -0,0 +1,393 @@ +package org.broadinstitute.sting.alignment.bwa.java; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.alignment.Alignment; +import org.broadinstitute.sting.alignment.bwa.BWAAligner; +import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; +import org.broadinstitute.sting.alignment.reference.bwt.*; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Utils; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; +import java.util.PriorityQueue; + +/** + * Create imperfect alignments from the read to the genome represented by the given BWT / suffix array. + * + * @author mhanna + * @version 0.1 + */ +public class BWAJavaAligner extends BWAAligner { + /** + * BWT in the forward direction. + */ + private BWT forwardBWT; + + /** + * BWT in the reverse direction. + */ + private BWT reverseBWT; + + /** + * Suffix array in the forward direction. + */ + private SuffixArray forwardSuffixArray; + + /** + * Suffix array in the reverse direction. + */ + private SuffixArray reverseSuffixArray; + + /** + * Maximum edit distance (-n option from original BWA). 
+ */ + private final int MAXIMUM_EDIT_DISTANCE = 4; + + /** + * Maximum number of gap opens (-o option from original BWA). + */ + private final int MAXIMUM_GAP_OPENS = 1; + + /** + * Maximum number of gap extensions (-e option from original BWA). + */ + private final int MAXIMUM_GAP_EXTENSIONS = 6; + + /** + * Penalty for straight mismatches (-M option from original BWA). + */ + public final int MISMATCH_PENALTY = 3; + + /** + * Penalty for gap opens (-O option from original BWA). + */ + public final int GAP_OPEN_PENALTY = 11; + + /** + * Penalty for gap extensions (-E option from original BWA). + */ + public final int GAP_EXTENSION_PENALTY = 4; + + /** + * Skip the ends of indels. + */ + public final int INDEL_END_SKIP = 5; + + public BWAJavaAligner( File forwardBWTFile, File reverseBWTFile, File forwardSuffixArrayFile, File reverseSuffixArrayFile ) { + super(null,null); + forwardBWT = new BWTReader(forwardBWTFile).read(); + reverseBWT = new BWTReader(reverseBWTFile).read(); + forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile,forwardBWT).read(); + reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile,reverseBWT).read(); + } + + /** + * Close this instance of the BWA pointer and delete its resources. + */ + @Override + public void close() { + throw new UnsupportedOperationException("BWA aligner can't currently be closed."); + } + + /** + * Update the current parameters of this aligner. + * @param configuration New configuration to set. + */ + public void updateConfiguration(BWAConfiguration configuration) { + throw new UnsupportedOperationException("Configuration of the BWA aligner can't currently be changed."); + } + + /** + * Allow the aligner to choose one alignment randomly from the pile of best alignments. + * @param bases Bases to align. 
+ * @return An alignment (not reached: this implementation always throws). + * @throws UnsupportedOperationException always. + */ + public Alignment getBestAlignment(final byte[] bases) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); } + + /** + * Align the read to the reference. + * @param read Read to align. + * @param header Optional header to drop in place. + * @return The aligned read (not reached: this implementation always throws). + * @throws UnsupportedOperationException always. + */ + public SAMRecord align(final SAMRecord read, final SAMFileHeader header) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); } + + /** + * Get an iterator of alignments, batched by mapping quality. + * @param bases List of bases. + * @return Iterator to alignments (not reached: this implementation always throws). + * @throws UnsupportedOperationException always. + */ + public Iterable getAllAlignments(final byte[] bases) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); } + + /** + * Get an iterator of aligned reads, batched by mapping quality. + * @param read Read to align. + * @param newHeader Optional new header to use when aligning the read. + * @return Iterator to alignments (not reached: this implementation always throws). + * @throws UnsupportedOperationException always. + */ + public Iterable alignAll(final SAMRecord read, final SAMFileHeader newHeader) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); } + + /** + * Align the bases of the given read against the forward and reverse BWTs. + * @param read Read whose bases should be aligned. + * @return The alignments found for this read. + */ + public List align( SAMRecord read ) { + List successfulMatches = new ArrayList(); + + Byte[] uncomplementedBases = normalizeBases(read.getReadBases()); + Byte[] complementedBases = normalizeBases(Utils.reverse(BaseUtils.simpleReverseComplement(read.getReadBases()))); + + List forwardLowerBounds = LowerBound.create(uncomplementedBases,forwardBWT); + List reverseLowerBounds = LowerBound.create(complementedBases,reverseBWT); + + // Seed the best score with any score that won't overflow on comparison. 
+ int bestScore = Integer.MAX_VALUE - MISMATCH_PENALTY; + int bestDiff = MAXIMUM_EDIT_DISTANCE+1; + int maxDiff = MAXIMUM_EDIT_DISTANCE; + + PriorityQueue alignments = new PriorityQueue(); + + // Create a fictional initial alignment, with the position just off the end of the read, and the limits + // set as the entire BWT. + alignments.add(createSeedAlignment(reverseBWT)); + alignments.add(createSeedAlignment(forwardBWT)); + + while(!alignments.isEmpty()) { + BWAAlignment alignment = alignments.remove(); + + // From bwtgap.c in the original BWT; if the rank is worse than the best score + the mismatch PENALTY, move on. + if( alignment.getScore() > bestScore + MISMATCH_PENALTY ) + break; + + Byte[] bases = alignment.isNegativeStrand() ? complementedBases : uncomplementedBases; + BWT bwt = alignment.isNegativeStrand() ? forwardBWT : reverseBWT; + List lowerBounds = alignment.isNegativeStrand() ? reverseLowerBounds : forwardLowerBounds; + + // if z < D(i) then return {} + int mismatches = maxDiff - alignment.getMismatches() - alignment.getGapOpens() - alignment.getGapExtensions(); + if( alignment.position < lowerBounds.size()-1 && mismatches < lowerBounds.get(alignment.position+1).value ) + continue; + + if(mismatches == 0) { + exactMatch(alignment,bases,bwt); + if(alignment.loBound > alignment.hiBound) + continue; + } + + // Found a valid alignment; store it and move on. 
+ if(alignment.position >= read.getReadLength()-1) { + for(long bwtIndex = alignment.loBound; bwtIndex <= alignment.hiBound; bwtIndex++) { + BWAAlignment finalAlignment = alignment.clone(); + + if( finalAlignment.isNegativeStrand() ) + finalAlignment.setAlignmentStart(forwardSuffixArray.get(bwtIndex) + 1); + else { + int sizeAlongReference = read.getReadLength() - + finalAlignment.getNumberOfBasesMatchingState(AlignmentState.INSERTION) + + finalAlignment.getNumberOfBasesMatchingState(AlignmentState.DELETION); + finalAlignment.setAlignmentStart(reverseBWT.length() - reverseSuffixArray.get(bwtIndex) - sizeAlongReference + 1); + } + + successfulMatches.add(finalAlignment); + + bestScore = Math.min(finalAlignment.getScore(),bestScore); + bestDiff = Math.min(finalAlignment.getMismatches()+finalAlignment.getGapOpens()+finalAlignment.getGapExtensions(),bestDiff); + maxDiff = bestDiff + 1; + } + + continue; + } + + //System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d, base = %s%n", alignments.size(), alignment, lowerBounds.get(alignment.position+1).value, alignment.position >= 0 ? (char)bases[alignment.position].byteValue() : ""); + /* + System.out.printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%d,%d]\t[%d,%d]%n",alignments.size(), + alignment.negativeStrand?1:0, + bases.length-alignment.position-1, + alignment.getCurrentState().toString().charAt(0), + alignment.getMismatches(), + alignment.getGapOpens(), + alignment.getGapExtensions(), + lowerBounds.get(alignment.position+1).value, + lowerBounds.get(alignment.position+1).width, + alignment.loBound, + alignment.hiBound); + */ + + // Temporary -- look ahead to see if the next alignment is bounded. 
+ boolean allowDifferences = mismatches > 0; + boolean allowMismatches = mismatches > 0; + + if( allowDifferences && + alignment.position+1 >= INDEL_END_SKIP-1+alignment.getGapOpens()+alignment.getGapExtensions() && + read.getReadLength()-1-(alignment.position+1) >= INDEL_END_SKIP+alignment.getGapOpens()+alignment.getGapExtensions() ) { + if( alignment.getCurrentState() == AlignmentState.MATCH_MISMATCH ) { + if( alignment.getGapOpens() < MAXIMUM_GAP_OPENS ) { + // Add a potential insertion extension. + BWAAlignment insertionAlignment = createInsertionAlignment(alignment); + insertionAlignment.incrementGapOpens(); + alignments.add(insertionAlignment); + + // Add a potential deletion by marking a deletion and augmenting the position. + List deletionAlignments = createDeletionAlignments(bwt,alignment); + for( BWAAlignment deletionAlignment: deletionAlignments ) + deletionAlignment.incrementGapOpens(); + alignments.addAll(deletionAlignments); + } + } + else if( alignment.getCurrentState() == AlignmentState.INSERTION ) { + if( alignment.getGapExtensions() < MAXIMUM_GAP_EXTENSIONS && mismatches > 0 ) { + // Add a potential insertion extension. + BWAAlignment insertionAlignment = createInsertionAlignment(alignment); + insertionAlignment.incrementGapExtensions(); + alignments.add(insertionAlignment); + } + } + else if( alignment.getCurrentState() == AlignmentState.DELETION ) { + if( alignment.getGapExtensions() < MAXIMUM_GAP_EXTENSIONS && mismatches > 0 ) { + // Add a potential deletion by marking a deletion and augmenting the position. 
+ List deletionAlignments = createDeletionAlignments(bwt,alignment); + for( BWAAlignment deletionAlignment: deletionAlignments ) + deletionAlignment.incrementGapExtensions(); + alignments.addAll(deletionAlignments); + } + } + } + + // Mismatches + alignments.addAll(createMatchedAlignments(bwt,alignment,bases,allowDifferences&&allowMismatches)); + } + + return successfulMatches; + } + + /** + * Create an seeding alignment to use as a starting point when traversing. + * @param bwt source BWT. + * @return Seed alignment. + */ + private BWAAlignment createSeedAlignment(BWT bwt) { + BWAAlignment seed = new BWAAlignment(this); + seed.setNegativeStrand(bwt == forwardBWT); + seed.position = -1; + seed.loBound = 0; + seed.hiBound = bwt.length(); + return seed; + } + + /** + * Creates a new alignments representing direct matches / mismatches. + * @param bwt Source BWT with which to work. + * @param alignment Alignment for the previous position. + * @param bases The bases in the read. + * @param allowMismatch Should mismatching bases be allowed? + * @return New alignment representing this position if valid; null otherwise. + */ + private List createMatchedAlignments( BWT bwt, BWAAlignment alignment, Byte[] bases, boolean allowMismatch ) { + List newAlignments = new ArrayList(); + + List baseChoices = new ArrayList(); + Byte thisBase = bases[alignment.position+1]; + + if( allowMismatch ) + baseChoices.addAll(Bases.allOf()); + else + baseChoices.add(thisBase); + + if( thisBase != null ) { + // Keep rotating the current base to the last position until we've hit the current base. 
+ for( ;; ) { + baseChoices.add(baseChoices.remove(0)); + if( thisBase.equals(baseChoices.get(baseChoices.size()-1)) ) + break; + + } + } + + for(Byte candidate: baseChoices) { + // An unknown read base is stored as null and can only be consumed as a mismatch; when mismatches + // are not allowed it is the sole choice, so skip it rather than unboxing null. + if( candidate == null ) + continue; + byte base = candidate; + + BWAAlignment newAlignment = alignment.clone(); + + newAlignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1; + newAlignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound); + + // If this alignment is invalid (empty BWT interval), skip it. + if( newAlignment.loBound > newAlignment.hiBound ) + continue; + + newAlignment.position++; + newAlignment.addState(AlignmentState.MATCH_MISMATCH); + // An unknown read base always counts as a mismatch, whichever base it is paired with. + if( bases[newAlignment.position] == null || base != bases[newAlignment.position] ) + newAlignment.incrementMismatches(); + + newAlignments.add(newAlignment); + } + + return newAlignments; + } + + /** + * Create a new alignment representing an insertion at this point in the read. + * The read position advances by one; the BWT bounds are left unchanged. + * @param alignment Alignment from which to derive the insertion. + * @return New alignment reflecting the insertion. + */ + private BWAAlignment createInsertionAlignment( BWAAlignment alignment ) { + // Add a potential insertion extension. + BWAAlignment newAlignment = alignment.clone(); + newAlignment.position++; + newAlignment.addState(AlignmentState.INSERTION); + return newAlignment; + } + + /** + * Create new alignments representing a deletion at this point in the read. + * The BWT bounds are narrowed by each candidate base; the read position does not advance. + * @param bwt source BWT for inferring deletion info. + * @param alignment Alignment from which to derive the deletion. + * @return New alignments reflecting all possible deletions. + */ + private List createDeletionAlignments( BWT bwt, BWAAlignment alignment) { + List newAlignments = new ArrayList(); + for(byte base: Bases.instance) { + BWAAlignment newAlignment = alignment.clone(); + + newAlignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1; + newAlignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound); + + // If this alignment is invalid (empty BWT interval), skip it. 
+ if( newAlignment.loBound > newAlignment.hiBound ) + continue; + + newAlignment.addState(AlignmentState.DELETION); + + newAlignments.add(newAlignment); + } + + return newAlignments; + } + + /** + * Exactly match the given alignment against the given BWT. + * Advances the alignment's position and narrows its bounds in place; stops as soon as the + * interval becomes empty (loBound > hiBound). An unknown (null) base can never match exactly, + * so reaching one also leaves the alignment with an empty interval. + * @param alignment Alignment to match. + * @param bases Bases to use; null entries denote unknown bases. + * @param bwt BWT to use. + */ + private void exactMatch( BWAAlignment alignment, Byte[] bases, BWT bwt ) { + while( ++alignment.position < bases.length ) { + Byte base = bases[alignment.position]; + // Report 'no match' through the bounds instead of failing with a NullPointerException on unboxing. + if( base == null ) { + alignment.loBound = 1; + alignment.hiBound = 0; + return; + } + alignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1; + alignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound); + if( alignment.loBound > alignment.hiBound ) + return; + } + } + + /** + * Make each base into A/C/G/T or null if unknown. + * @param bases Base string to normalize. + * @return Array of normalized bases. + */ + private Byte[] normalizeBases( byte[] bases ) { + Byte[] normalBases = new Byte[bases.length]; + for(int i = 0; i < bases.length; i++) + normalBases[i] = Bases.fromASCII(bases[i]); + return normalBases; + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java new file mode 100644 index 000000000..be7514255 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java @@ -0,0 +1,88 @@ +package org.broadinstitute.sting.alignment.bwa.java; + +import org.broadinstitute.sting.alignment.reference.bwt.BWT; + +import java.util.ArrayList; +import java.util.List; + +/** + * At any point along the given read, what is a good lower bound for the + * total number of differences? + * + * @author mhanna + * @version 0.1 + */ +public class LowerBound { + /** + * Lower bound of the suffix array. + */ + public final long loIndex; + + /** + * Upper bound of the suffix array. 
+ */ + public final long hiIndex; + + /** + * Width of the bwt from loIndex -> hiIndex, inclusive. + */ + public final long width; + + /** + * The lower bound at the given point. + */ + public final int value; + + /** + * Create a new lower bound with the given value. + * @param loIndex The lower bound of the BWT. + * @param hiIndex The upper bound of the BWT. + * @param value Value for the lower bound at this site. + */ + private LowerBound(long loIndex, long hiIndex, int value) { + this.loIndex = loIndex; + this.hiIndex = hiIndex; + this.width = hiIndex - loIndex + 1; + this.value = value; + } + + /** + * Create a non-optimal bound according to the algorithm specified in Figure 3 of the BWA paper. + * The read is walked from its last base to its first; whenever the BWT interval becomes empty, or + * the base is unknown (null), the interval is reset to the whole BWT and the bound is incremented. + * @param bases Bases of the read; null entries denote non-ACGT bases. + * @param bwt BWT to check against. + * @return A list holding one lower bound per base of the read, in read order. + * + */ + public static List create(Byte[] bases, BWT bwt) { + List bounds = new ArrayList(); + + long loIndex = 0, hiIndex = bwt.length(); + int mismatches = 0; + for( int i = bases.length-1; i >= 0; i-- ) { + Byte base = bases[i]; + + // Ignore non-ACGT bases. + if( base != null ) { + loIndex = bwt.counts(base) + bwt.occurrences(base,loIndex-1) + 1; + hiIndex = bwt.counts(base) + bwt.occurrences(base,hiIndex); + } + + if( base == null || loIndex > hiIndex ) { + loIndex = 0; + hiIndex = bwt.length(); + mismatches++; + } + bounds.add(0,new LowerBound(loIndex,hiIndex,mismatches)); + } + + return bounds; + } + + /** + * Create a string representation of this bound. + * @return String version of this bound. 
+ */ + public String toString() { + return String.format("LowerBound: w = %d, value = %d",width,value); + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/package-info.java b/public/java/src/org/broadinstitute/sting/alignment/package-info.java new file mode 100644 index 000000000..60cf1e425 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/package-info.java @@ -0,0 +1,4 @@ +/** + * Analyses used to validate the correctness and performance of the BWA Java bindings. + */ +package org.broadinstitute.sting.alignment; \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java new file mode 100644 index 000000000..ec10415dd --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java @@ -0,0 +1,68 @@ +package org.broadinstitute.sting.alignment.reference.bwt; + +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintStream; + +/** + * Writes .amb files - a file indicating where 'holes' (indeterminate bases) + * exist in the contig. Currently, only empty, placeholder AMBs are supported. + * + * @author mhanna + * @version 0.1 + */ +public class AMBWriter { + /** + * Number of holes is fixed at zero. + */ + private static final int NUM_HOLES = 0; + + /** + * Output stream to which the AMB data is written. + */ + private final PrintStream out; + + /** + * Create a new AMBWriter targeting the given file. + * @param file file into which AMB data should be written. + * @throws java.io.IOException if there is a problem opening the output file. + */ + public AMBWriter(File file) throws IOException { + out = new PrintStream(file); + } + + /** + * Create a new AMBWriter targeting the given OutputStream. 
+ * @param stream Stream into which AMB data should be written. + */ + public AMBWriter(OutputStream stream) { + out = new PrintStream(stream); + } + + /** + * Write the contents of the given dictionary into the AMB file. + * Assumes that there are no holes in the dictionary, so only the header line + * (total genome length, number of sequences, number of holes) is written. + * @param dictionary Dictionary to write. + */ + public void writeEmpty(SAMSequenceDictionary dictionary) { + long genomeLength = 0L; + for(SAMSequenceRecord sequence: dictionary.getSequences()) + genomeLength += sequence.getSequenceLength(); + + int sequences = dictionary.getSequences().size(); + + // Write the header + out.printf("%d %d %d%n",genomeLength,sequences,NUM_HOLES); + } + + /** + * Close the given output stream. + */ + public void close() { + out.close(); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java new file mode 100644 index 000000000..8d692a9e7 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java @@ -0,0 +1,95 @@ +package org.broadinstitute.sting.alignment.reference.bwt; + +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintStream; + +/** + * Writes .ann files - an alternate sequence dictionary format + * used by BWA/C. For best results, the input sequence dictionary + * should be created with Picard's CreateSequenceDictionary.jar, + * TRUNCATE_NAMES_AT_WHITESPACE=false. + * + * @author mhanna + * @version 0.1 + */ +public class ANNWriter { + /** + * BWA uses a fixed seed of 11, written into every file. + */ + private static final int BNS_SEED = 11; + + /** + * A seemingly unused value that appears in every contig in the ANN. + */ + private static final int GI = 0; + + /** + * Output stream to which the ANN data is written. 
+ */ + private final PrintStream out; + + /** + * Create a new ANNWriter targeting the given file. + * @param file file into which ANN data should be written. + * @throws IOException if there is a problem opening the output file. + */ + public ANNWriter(File file) throws IOException { + out = new PrintStream(file); + } + + /** + * Create a new ANNWriter targeting the given OutputStream. + * @param stream Stream into which ANN data should be written. + */ + public ANNWriter(OutputStream stream) { + out = new PrintStream(stream); + } + + /** + * Write the contents of the given dictionary into the ANN file. + * Assumes that no ambs (blocks of indeterminate base) are present in the dictionary. + * Each contig's offset is its start within the concatenation of all contigs, i.e. the + * summed length of the contigs written before it. + * @param dictionary Dictionary to write. + */ + public void write(SAMSequenceDictionary dictionary) { + long genomeLength = 0L; + for(SAMSequenceRecord sequence: dictionary.getSequences()) + genomeLength += sequence.getSequenceLength(); + + int sequences = dictionary.getSequences().size(); + + // Write the header + out.printf("%d %d %d%n",genomeLength,sequences,BNS_SEED); + + // Start of the next contig within the concatenated reference. + long nextOffset = 0L; + + for(SAMSequenceRecord sequence: dictionary.getSequences()) { + String fullSequenceName = sequence.getSequenceName(); + String trimmedSequenceName = fullSequenceName; + String sequenceComment = "(null)"; + + // This contig begins where the previous one ended (previously every contig was written with offset 0). + long offset = nextOffset; + nextOffset += sequence.getSequenceLength(); + + // Separate the sequence name from the sequence comment, based on BWA's definition. + // BWA's definition appears to accept a zero-length contig name, so mimic that behavior. + if(fullSequenceName.indexOf(' ') >= 0) { + trimmedSequenceName = fullSequenceName.substring(0,fullSequenceName.indexOf(' ')); + sequenceComment = fullSequenceName.substring(fullSequenceName.indexOf(' ')+1); + } + + // Write the sequence GI (?), name, and comment. + out.printf("%d %s %s%n",GI,trimmedSequenceName,sequenceComment); + // Write the sequence offset, length, and ambs (currently fixed at 0). 
+ out.printf("%d %d %d%n",offset,sequence.getSequenceLength(),0); + } + } + + /** + * Close the given output stream. + */ + public void close() { + out.close(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java new file mode 100644 index 000000000..7f8c48253 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java @@ -0,0 +1,172 @@ +package org.broadinstitute.sting.alignment.reference.bwt; + +import org.broadinstitute.sting.alignment.reference.packing.PackUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +/** + * Represents the Burrows-Wheeler Transform of a reference sequence. + * + * @author mhanna + * @version 0.1 + */ +public class BWT { + /** + * Write an occurrence table after every SEQUENCE_BLOCK_SIZE bases. + * For this implementation to behave correctly, SEQUENCE_BLOCK_SIZE % 8 == 0 + */ + public static final int SEQUENCE_BLOCK_SIZE = 128; + + /** + * The inverse SA, used as a placeholder for determining where the special EOL character sits. + */ + protected final long inverseSA0; + + /** + * Cumulative counts for the entire BWT. + */ + protected final Counts counts; + + /** + * The individual sequence blocks, modelling how they appear on disk. + */ + protected final SequenceBlock[] sequenceBlocks; + + /** + * Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII). + * @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence. + * @param counts Cumulative count of bases, in A,C,G,T order. + * @param sequenceBlocks The full BWT sequence, sans the '$'. 
+ */ + public BWT( long inverseSA0, Counts counts, SequenceBlock[] sequenceBlocks ) { + this.inverseSA0 = inverseSA0; + this.counts = counts; + this.sequenceBlocks = sequenceBlocks; + } + + /** + * Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII). + * @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence. + * @param counts Count of bases, in A,C,G,T order. + * @param sequence The full BWT sequence, sans the '$'. + */ + public BWT( long inverseSA0, Counts counts, byte[] sequence ) { + this(inverseSA0,counts,generateSequenceBlocks(sequence)); + } + + /** + * Extract the full sequence from the list of block. + * @return The full BWT string as a byte array. + */ + public byte[] getSequence() { + byte[] sequence = new byte[(int)counts.getTotal()]; + for( SequenceBlock block: sequenceBlocks ) + System.arraycopy(block.sequence,0,sequence,block.sequenceStart,block.sequenceLength); + return sequence; + } + + /** + * Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search. + * @param base The base. + * @return Total counts for all bases lexicographically smaller than this base. + */ + public long counts(byte base) { + return counts.getCumulative(base); + } + + /** + * Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search. + * @param base The base. + * @param index The position to search within the BWT. + * @return Total counts for all bases lexicographically smaller than this base. + */ + public long occurrences(byte base,long index) { + SequenceBlock block = getSequenceBlock(index); + int position = getSequencePosition(index); + long accumulator = block.occurrences.get(base); + for(int i = 0; i <= position; i++) { + if(base == block.sequence[i]) + accumulator++; + } + return accumulator; + } + + /** + * The number of bases in the BWT as a whole. + * @return Number of bases. 
+ */ + public long length() { + return counts.getTotal(); + } + + /** + * Create a new BWT from the given reference sequence. + * @param referenceSequence Sequence from which to derive the BWT. + * @return reference sequence-derived BWT. + */ + public static BWT createFromReferenceSequence(byte[] referenceSequence) { + SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence); + + byte[] bwt = new byte[(int)suffixArray.length()-1]; + int bwtIndex = 0; + for(long suffixArrayIndex = 0; suffixArrayIndex < suffixArray.length(); suffixArrayIndex++) { + if(suffixArray.get(suffixArrayIndex) == 0) + continue; + bwt[bwtIndex++] = referenceSequence[(int)suffixArray.get(suffixArrayIndex)-1]; + } + + return new BWT(suffixArray.inverseSA0,suffixArray.occurrences,bwt); + } + + /** + * Gets the base at a given position in the BWT. + * @param index The index to use. + * @return The base at that location. + */ + protected byte getBase(long index) { + if(index == inverseSA0) + throw new ReviewedStingException(String.format("Base at index %d does not have a text representation",index)); + + SequenceBlock block = getSequenceBlock(index); + int position = getSequencePosition(index); + return block.sequence[position]; + } + + private SequenceBlock getSequenceBlock(long index) { + // If the index is above the SA-1[0], remap it to the appropriate coordinate space. + if(index > inverseSA0) index--; + return sequenceBlocks[(int)(index/SEQUENCE_BLOCK_SIZE)]; + } + + private int getSequencePosition(long index) { + // If the index is above the SA-1[0], remap it to the appropriate coordinate space. + if(index > inverseSA0) index--; + return (int)(index%SEQUENCE_BLOCK_SIZE); + } + + /** + * Create a set of sequence blocks from one long sequence. + * @param sequence Sequence from which to derive blocks. + * @return Array of sequence blocks containing data from the sequence. 
+ */ + private static SequenceBlock[] generateSequenceBlocks( byte[] sequence ) { + Counts occurrences = new Counts(); + + int numSequenceBlocks = PackUtils.numberOfPartitions(sequence.length,SEQUENCE_BLOCK_SIZE); + SequenceBlock[] sequenceBlocks = new SequenceBlock[numSequenceBlocks]; + + for( int block = 0; block < numSequenceBlocks; block++ ) { + int blockStart = block*SEQUENCE_BLOCK_SIZE; + int blockLength = Math.min(SEQUENCE_BLOCK_SIZE, sequence.length-blockStart); + byte[] subsequence = new byte[blockLength]; + + System.arraycopy(sequence,blockStart,subsequence,0,blockLength); + + sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,occurrences.clone(),subsequence); + + for( byte base: subsequence ) + occurrences.increment(base); + } + + return sequenceBlocks; + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java new file mode 100644 index 000000000..5c4f6d39d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java @@ -0,0 +1,89 @@ +package org.broadinstitute.sting.alignment.reference.bwt; + +import org.broadinstitute.sting.alignment.reference.packing.BasePackedInputStream; +import org.broadinstitute.sting.alignment.reference.packing.PackUtils; +import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedInputStream; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.ByteOrder; +/** + * Reads a BWT from a given file. + * + * @author mhanna + * @version 0.1 + */ +public class BWTReader { + /** + * Input stream from which to read BWT data. + */ + private FileInputStream inputStream; + + /** + * Create a new BWT reader. + * @param inputFile File in which the BWT is stored. 
+ */ + public BWTReader( File inputFile ) { + try { + this.inputStream = new FileInputStream(inputFile); + } + catch( FileNotFoundException ex ) { + throw new ReviewedStingException("Unable to open input file", ex); + } + } + + /** + * Read a BWT from the input stream. + * @return The BWT stored in the input stream. + */ + public BWT read() { + UnsignedIntPackedInputStream uintPackedInputStream = new UnsignedIntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN); + BasePackedInputStream basePackedInputStream = new BasePackedInputStream(Integer.class, inputStream, ByteOrder.LITTLE_ENDIAN); + + long inverseSA0; + long[] count; + SequenceBlock[] sequenceBlocks; + + try { + inverseSA0 = uintPackedInputStream.read(); + count = new long[PackUtils.ALPHABET_SIZE]; + uintPackedInputStream.read(count); + + long bwtSize = count[PackUtils.ALPHABET_SIZE-1]; + sequenceBlocks = new SequenceBlock[PackUtils.numberOfPartitions(bwtSize,BWT.SEQUENCE_BLOCK_SIZE)]; + + for( int block = 0; block < sequenceBlocks.length; block++ ) { + int sequenceStart = block* BWT.SEQUENCE_BLOCK_SIZE; + int sequenceLength = (int)Math.min(BWT.SEQUENCE_BLOCK_SIZE,bwtSize-sequenceStart); + + long[] occurrences = new long[PackUtils.ALPHABET_SIZE]; + byte[] bwt = new byte[sequenceLength]; + + uintPackedInputStream.read(occurrences); + basePackedInputStream.read(bwt); + + sequenceBlocks[block] = new SequenceBlock(sequenceStart,sequenceLength,new Counts(occurrences,false),bwt); + } + } + catch( IOException ex ) { + throw new ReviewedStingException("Unable to read BWT from input stream.", ex); + } + + return new BWT(inverseSA0, new Counts(count,true), sequenceBlocks); + } + + /** + * Close the input stream. 
+ */ + public void close() { + try { + inputStream.close(); + } + catch( IOException ex ) { + throw new ReviewedStingException("Unable to close input file", ex); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java new file mode 100644 index 000000000..3370f79c8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java @@ -0,0 +1,60 @@ +package org.broadinstitute.sting.alignment.reference.bwt; + +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.reference.ReferenceSequenceFileFactory; +import net.sf.samtools.SAMSequenceDictionary; + +import java.io.File; +import java.io.IOException; + +/** + * Generate BWA supplementary files (.ann, .amb) from the command line. + * + * @author mhanna + * @version 0.1 + */ +public class BWTSupplementaryFileGenerator { + enum SupplementaryFileType { ANN, AMB } + + public static void main(String[] args) throws IOException { + if(args.length < 3) + usage("Incorrect number of arguments supplied"); + + File fastaFile = new File(args[0]); + File outputFile = new File(args[1]); + SupplementaryFileType outputType = null; + try { + outputType = Enum.valueOf(SupplementaryFileType.class,args[2]); + } + catch(IllegalArgumentException ex) { + usage("Invalid output type: " + args[2]); + } + + ReferenceSequenceFile sequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(fastaFile); + SAMSequenceDictionary dictionary = sequenceFile.getSequenceDictionary(); + + switch(outputType) { + case ANN: + ANNWriter annWriter = new ANNWriter(outputFile); + annWriter.write(dictionary); + annWriter.close(); + break; + case AMB: + AMBWriter ambWriter = new AMBWriter(outputFile); + ambWriter.writeEmpty(dictionary); + ambWriter.close(); + break; + default: + usage("Unsupported output type: " + 
outputType); + } + } + + /** + * Print usage information and exit. + */ + private static void usage(String message) { + System.err.println(message); + System.err.println("Usage: BWTSupplementaryFileGenerator "); + System.exit(1); + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java new file mode 100644 index 000000000..a813cdc9a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTWriter.java @@ -0,0 +1,71 @@ +package org.broadinstitute.sting.alignment.reference.bwt; + +import org.broadinstitute.sting.alignment.reference.packing.BasePackedOutputStream; +import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedOutputStream; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.*; +import java.nio.ByteOrder; + +/** + * Writes an in-memory BWT to an outputstream. + * + * @author mhanna + * @version 0.1 + */ +public class BWTWriter { + /** + * Input stream from which to read BWT data. + */ + private final OutputStream outputStream; + + /** + * Create a new BWT writer. + * @param outputFile File in which the BWT is stored. + */ + public BWTWriter( File outputFile ) { + try { + this.outputStream = new BufferedOutputStream(new FileOutputStream(outputFile)); + } + catch( FileNotFoundException ex ) { + throw new ReviewedStingException("Unable to open output file", ex); + } + } + + /** + * Write a BWT to the output stream. + * @param bwt Transform to be written to the output stream. 
+ */ + public void write( BWT bwt ) { + UnsignedIntPackedOutputStream intPackedOutputStream = new UnsignedIntPackedOutputStream(outputStream, ByteOrder.LITTLE_ENDIAN); + BasePackedOutputStream basePackedOutputStream = new BasePackedOutputStream(Integer.class, outputStream, ByteOrder.LITTLE_ENDIAN); + + try { + // Header: inverse SA entry for position 0, then the cumulative base counts. + intPackedOutputStream.write(bwt.inverseSA0); + intPackedOutputStream.write(bwt.counts.toArray(true)); + + // Each block is preceded by the (non-cumulative) occurrence counts seen before it. + for( SequenceBlock block: bwt.sequenceBlocks ) { + intPackedOutputStream.write(block.occurrences.toArray(false)); + basePackedOutputStream.write(block.sequence); + } + + // The last block is the last set of counts in the structure. + intPackedOutputStream.write(bwt.counts.toArray(false)); + } + catch( IOException ex ) { + throw new ReviewedStingException("Unable to write BWT to output stream.", ex); + } + } + + /** + * Close the output stream. + */ + public void close() { + try { + outputStream.close(); + } + catch( IOException ex ) { + throw new ReviewedStingException("Unable to close output file", ex); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Bases.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Bases.java new file mode 100644 index 000000000..bc0a5b63d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Bases.java @@ -0,0 +1,108 @@ +package org.broadinstitute.sting.alignment.reference.bwt; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.*; + +/** + * Enhanced enum representation of a base. + * + * @author mhanna + * @version 0.1 + */ +public class Bases implements Iterable +{ + public static byte A = 'A'; + public static byte C = 'C'; + public static byte G = 'G'; + public static byte T = 'T'; + + public static final Bases instance = new Bases(); + + private static final List allBases; + + /** + * Representation of the base broken down by packed value. 
+ */ + private static final Map basesByPack = new HashMap(); + + static { + List bases = new ArrayList(); + bases.add(A); + bases.add(C); + bases.add(G); + bases.add(T); + allBases = Collections.unmodifiableList(bases); + + for(int i = 0; i < allBases.size(); i++) + basesByPack.put(i,allBases.get(i)); + } + + /** + * Not instantiable from outside; the only instance is {@link #instance}, + * which exists so the set of bases can be used in a for-each loop. + */ + private Bases() { + } + + /** + * Return all possible bases. + * @return Byte representation of all bases. + */ + public static Collection allOf() { + return allBases; + } + + /** + * Gets the number of known bases. + * @return The number of known bases. + */ + public static int size() { + return allBases.size(); + } + + /** + * Gets an iterator over all known bases, in packed-value order (A, C, G, T). + * Iterates the unmodifiable list rather than the hash map so that the order + * is guaranteed and the iterator cannot remove entries from the lookup table. + * @return Iterator over all known bases. + */ + public Iterator iterator() { + return allBases.iterator(); + } + + /** + * Get the given base from the packed representation. + * @param pack Packed representation. + * @return base. + */ + public static byte fromPack( int pack ) { return basesByPack.get(pack); } + + /** + * Convert the given base to its packed value. + * @param ascii ASCII representation of the base. + * @return Packed value. + */ + public static int toPack( byte ascii ) + { + for( Map.Entry entry: basesByPack.entrySet() ) { + if( entry.getValue().equals(ascii) ) + return entry.getKey(); + } + throw new ReviewedStingException(String.format("Base %c is an invalid base to pack", (char)ascii)); + } + + /** + * Convert the ASCII representation of a base to its 'normalized' representation. + * @param base The base itself. + * @return The byte, if present. Null if unknown. 
+ */ + public static Byte fromASCII( byte base ) { + Byte found = null; + for( Byte normalized: allBases ) { + if( normalized.equals(base) ) { + found = normalized; + break; + } + } + return found; + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Counts.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Counts.java new file mode 100644 index 000000000..268b11ac4 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/Counts.java @@ -0,0 +1,151 @@ +package org.broadinstitute.sting.alignment.reference.bwt; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.HashMap; +import java.util.Map; + +/** + * Counts of how many bases of each type have been seen. + * + * @author mhanna + * @version 0.1 + */ +public class Counts implements Cloneable { + /** + * Internal representation of counts, broken down by ASCII value. + */ + private Map counts = new HashMap(); + + /** + * Internal representation of cumulative counts, broken down by ASCII value. + */ + private Map cumulativeCounts = new HashMap(); + + /** + * Create an empty Counts object with values A=0,C=0,G=0,T=0. + */ + public Counts() + { + for(byte base: Bases.instance) { + counts.put(base,0L); + cumulativeCounts.put(base,0L); + } + } + + /** + * Create a counts data structure with the given initial values. + * @param data Count data, broken down by base. + * @param cumulative Whether the counts are cumulative, (count_G=numA+numC+numG,for example). 
+ */ + public Counts( long[] data, boolean cumulative ) { + if(cumulative) { + long priorCount = 0; + for(byte base: Bases.instance) { + long count = data[Bases.toPack(base)]; + counts.put(base,count-priorCount); + cumulativeCounts.put(base,priorCount); + priorCount = count; + } + } + else { + long priorCount = 0; + for(byte base: Bases.instance) { + long count = data[Bases.toPack(base)]; + counts.put(base,count); + cumulativeCounts.put(base,priorCount); + priorCount += count; + } + } + } + + /** + * Convert to an array for persistence. + * @param cumulative Use a cumulative representation. + * @return Array of count values. + */ + public long[] toArray(boolean cumulative) { + long[] countArray = new long[counts.size()]; + if(cumulative) { + int index = 0; + boolean first = true; + for(byte base: Bases.instance) { + if(first) { + first = false; + continue; + } + countArray[index++] = getCumulative(base); + } + countArray[countArray.length-1] = getTotal(); + } + else { + int index = 0; + for(byte base: Bases.instance) + countArray[index++] = counts.get(base); + } + return countArray; + } + + /** + * Create a unique copy of the current object. + * @return A duplicate of this object. + */ + public Counts clone() { + Counts other; + try { + other = (Counts)super.clone(); + } + catch(CloneNotSupportedException ex) { + throw new ReviewedStingException("Unable to clone counts object", ex); + } + other.counts = new HashMap(counts); + other.cumulativeCounts = new HashMap(cumulativeCounts); + return other; + } + + /** + * Increment the number of bases seen at the given location. + * @param base Base to increment. + */ + public void increment(byte base) { + counts.put(base,counts.get(base)+1); + boolean increment = false; + for(byte cumulative: Bases.instance) { + if(increment) cumulativeCounts.put(cumulative,cumulativeCounts.get(cumulative)+1); + increment |= (cumulative == base); + } + } + + /** + * Gets a count of the number of bases seen at a given location. 
+ * Note that counts in this case are not cumulative (counts for A,C,G,T + * are independent). + * @param base Base for which to query counts. + * @return Number of bases of this type seen. + */ + public long get(byte base) { + return counts.get(base); + } + + /** + * Gets a count of the number of bases seen before this base. + * Note that counts in this case are cumulative. + * @param base Base for which to query counts. + * @return Number of bases of this type seen. + */ + public long getCumulative(byte base) { + return cumulativeCounts.get(base); + } + + /** + * How many total bases are represented by this count structure? + * @return Total bases represented. + */ + public long getTotal() { + // Accumulate in a long: an int accumulator with += silently narrows and overflows past 2^31-1 bases. + long accumulator = 0L; + for(byte base: Bases.instance) { + accumulator += get(base); + } + return accumulator; + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java new file mode 100755 index 000000000..801ab3a0b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/CreateBWTFromReference.java @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.alignment.reference.bwt; + +import net.sf.picard.reference.ReferenceSequence; +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.reference.ReferenceSequenceFileFactory; +import org.broadinstitute.sting.alignment.reference.packing.PackUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.File; +import java.io.IOException; + +/** + * Create a suffix array data structure. + * + * @author mhanna + * @version 0.1 + */ +public class CreateBWTFromReference { + private byte[] loadReference( File inputFile ) { + // Read in the first sequence in the input file + ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile); + ReferenceSequence sequence = reference.nextSequence(); + return sequence.getBases(); + } + + private byte[] loadReverseReference( File inputFile ) { + ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile); + ReferenceSequence sequence = reference.nextSequence(); + PackUtils.reverse(sequence.getBases()); + return sequence.getBases(); + } + + private Counts countOccurrences( byte[] sequence ) { + Counts occurrences = new Counts(); + for( byte base: sequence ) + occurrences.increment(base); + return occurrences; + } + + private long[] createSuffixArray( byte[] sequence ) { + return SuffixArray.createFromReferenceSequence(sequence).sequence; + } + + private long[] 
invertSuffixArray( long[] suffixArray ) { + long[] inverseSuffixArray = new long[suffixArray.length]; + for( int i = 0; i < suffixArray.length; i++ ) + inverseSuffixArray[(int)suffixArray[i]] = i; + return inverseSuffixArray; + } + + private long[] createCompressedSuffixArray( int[] suffixArray, int[] inverseSuffixArray ) { + long[] compressedSuffixArray = new long[suffixArray.length]; + compressedSuffixArray[0] = inverseSuffixArray[0]; + for( int i = 1; i < suffixArray.length; i++ ) + compressedSuffixArray[i] = inverseSuffixArray[suffixArray[i]+1]; + return compressedSuffixArray; + } + + private long[] createInversedCompressedSuffixArray( int[] compressedSuffixArray ) { + long[] inverseCompressedSuffixArray = new long[compressedSuffixArray.length]; + for( int i = 0; i < compressedSuffixArray.length; i++ ) + inverseCompressedSuffixArray[compressedSuffixArray[i]] = i; + return inverseCompressedSuffixArray; + } + + public static void main( String argv[] ) throws IOException { + if( argv.length != 5 ) { + System.out.println("USAGE: CreateBWTFromReference .fasta "); + return; + } + + String inputFileName = argv[0]; + File inputFile = new File(inputFileName); + + String bwtFileName = argv[1]; + File bwtFile = new File(bwtFileName); + + String rbwtFileName = argv[2]; + File rbwtFile = new File(rbwtFileName); + + String saFileName = argv[3]; + File saFile = new File(saFileName); + + String rsaFileName = argv[4]; + File rsaFile = new File(rsaFileName); + + CreateBWTFromReference creator = new CreateBWTFromReference(); + + byte[] sequence = creator.loadReference(inputFile); + byte[] reverseSequence = creator.loadReverseReference(inputFile); + + // Count the occurences of each given base. 
+ Counts occurrences = creator.countOccurrences(sequence); + System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences.getCumulative(Bases.A), + occurrences.getCumulative(Bases.C), + occurrences.getCumulative(Bases.G), + occurrences.getCumulative(Bases.T)); + + // Generate the suffix array and print diagnostics. + long[] suffixArrayData = creator.createSuffixArray(sequence); + long[] reverseSuffixArrayData = creator.createSuffixArray(reverseSequence); + + // Invert the suffix array and print diagnostics. + long[] inverseSuffixArray = creator.invertSuffixArray(suffixArrayData); + long[] reverseInverseSuffixArray = creator.invertSuffixArray(reverseSuffixArrayData); + + SuffixArray suffixArray = new SuffixArray( inverseSuffixArray[0], occurrences, suffixArrayData ); + SuffixArray reverseSuffixArray = new SuffixArray( reverseInverseSuffixArray[0], occurrences, reverseSuffixArrayData ); + + /* + // Create the data structure for the compressed suffix array and print diagnostics. + int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray.sequence,inverseSuffixArray); + int reconstructedInverseSA = compressedSuffixArray[0]; + for( int i = 0; i < 8; i++ ) { + System.out.printf("compressedSuffixArray[%d] = %d (SA-1[%d] = %d)%n", i, compressedSuffixArray[i], i, reconstructedInverseSA); + reconstructedInverseSA = compressedSuffixArray[reconstructedInverseSA]; + } + + // Create the data structure for the inverse compressed suffix array and print diagnostics. + int[] inverseCompressedSuffixArray = creator.createInversedCompressedSuffixArray(compressedSuffixArray); + for( int i = 0; i < 8; i++ ) { + System.out.printf("inverseCompressedSuffixArray[%d] = %d%n", i, inverseCompressedSuffixArray[i]); + } + */ + + // Create the BWT. + BWT bwt = BWT.createFromReferenceSequence(sequence); + BWT reverseBWT = BWT.createFromReferenceSequence(reverseSequence); + + byte[] bwtSequence = bwt.getSequence(); + System.out.printf("BWT: %s... 
(length = %d)%n", new String(bwtSequence,0,Math.min(80,bwtSequence.length)),bwt.length()); + + BWTWriter bwtWriter = new BWTWriter(bwtFile); + bwtWriter.write(bwt); + bwtWriter.close(); + + BWTWriter reverseBWTWriter = new BWTWriter(rbwtFile); + reverseBWTWriter.write(reverseBWT); + reverseBWTWriter.close(); + + /* + SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile); + saWriter.write(suffixArray); + saWriter.close(); + + SuffixArrayWriter reverseSAWriter = new SuffixArrayWriter(rsaFile); + reverseSAWriter.write(reverseSuffixArray); + reverseSAWriter.close(); + */ + + // Validate the freshly built BWT against the pre-existing <input>.bwt file. + File existingBWTFile = new File(inputFileName+".bwt"); + BWTReader existingBWTReader = new BWTReader(existingBWTFile); + BWT existingBWT = existingBWTReader.read(); + // The BWT is fully materialized in memory by read(), so the file handle can be released now. + existingBWTReader.close(); + + byte[] existingBWTSequence = existingBWT.getSequence(); + // Preview at most 80 bases; a shorter sequence would otherwise make the String constructor throw. + System.out.printf("Existing BWT: %s... (length = %d)%n",new String(existingBWTSequence,0,Math.min(80,existingBWTSequence.length)),existingBWT.length()); + + for( int i = 0; i < bwt.length(); i++ ) { + if( bwtSequence[i] != existingBWTSequence[i] ) + throw new ReviewedStingException("BWT mismatch at " + i); + } + + // Validate the freshly built suffix array against the pre-existing <input>.sa file. + File existingSAFile = new File(inputFileName+".sa"); + SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile,existingBWT); + SuffixArray existingSuffixArray = existingSuffixArrayReader.read(); + + for(int i = 0; i < suffixArray.length(); i++) { + if( i % 10000 == 0 ) + System.out.printf("Validating suffix array entry %d%n", i); + if( suffixArray.get(i) != existingSuffixArray.get(i) ) + throw new ReviewedStingException(String.format("Suffix array mismatch at %d; SA is %d; should be %d",i,existingSuffixArray.get(i),suffixArray.get(i))); + } + } + +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SequenceBlock.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SequenceBlock.java new file mode 100644 index 000000000..13714de1e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SequenceBlock.java @@ -0,0 +1,41 @@ 
+package org.broadinstitute.sting.alignment.reference.bwt; + +/** + * Models a block of bases within the BWT. + */ +public class SequenceBlock { + /** + * Start position of this sequence within the BWT. + */ + public final int sequenceStart; + + /** + * Length of this sequence within the BWT. + */ + public final int sequenceLength; + + + /** + * Occurrences of each letter up to this sequence block. + */ + public final Counts occurrences; + + /** + * Sequence for this segment. + */ + public final byte[] sequence; + + /** + * Create a new block within this BWT. + * @param sequenceStart Starting position of this sequence within the BWT. + * @param sequenceLength Length of this sequence. + * @param occurrences How many of each base has been seen before this sequence began. + * @param sequence The actual sequence from the BWT. + */ + public SequenceBlock( int sequenceStart, int sequenceLength, Counts occurrences, byte[] sequence ) { + this.sequenceStart = sequenceStart; + this.sequenceLength = sequenceLength; + this.occurrences = occurrences; + this.sequence = sequence; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java new file mode 100644 index 000000000..49af98bb9 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArray.java @@ -0,0 +1,158 @@ +package org.broadinstitute.sting.alignment.reference.bwt; + +import net.sf.samtools.util.StringUtil; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.Comparator; +import java.util.TreeSet; + +/** + * An in-memory representation of a suffix array. + * + * @author mhanna + * @version 0.1 + */ +public class SuffixArray { + public final long inverseSA0; + public final Counts occurrences; + + /** + * The elements of the sequence actually stored in memory. 
+ */ + protected final long[] sequence; + + /** + * How often are individual elements in the sequence actually stored + * in memory, as opposed to being calculated on the fly? + */ + protected final int sequenceInterval; + + /** + * The BWT used to calculate missing portions of the sequence. + */ + protected final BWT bwt; + + public SuffixArray(long inverseSA0, Counts occurrences, long[] sequence) { + this(inverseSA0,occurrences,sequence,1,null); + } + + /** + * Creates a new sequence array with the given inverse SA, occurrences, and values. + * @param inverseSA0 Inverse SA entry for the first element. + * @param occurrences Cumulative number of occurrences of A,C,G,T, in order. + * @param sequence The full suffix array. + * @param sequenceInterval How frequently is the sequence interval stored. + * @param bwt bwt used to infer the remaining entries in the BWT. + */ + public SuffixArray(long inverseSA0, Counts occurrences, long[] sequence, int sequenceInterval, BWT bwt) { + this.inverseSA0 = inverseSA0; + this.occurrences = occurrences; + this.sequence = sequence; + this.sequenceInterval = sequenceInterval; + this.bwt = bwt; + + if(sequenceInterval != 1 && bwt == null) + throw new ReviewedStingException("A BWT must be provided if the sequence interval is not 1"); + } + + /** + * Retrieves the length of the sequence array. + * @return Length of the suffix array. + */ + public long length() { + if( bwt != null ) + return bwt.length()+1; + else + return sequence.length; + } + + /** + * Get the suffix array value at a given sequence. + * @param index Index at which to retrieve the suffix array vaule. + * @return The suffix array value at that entry. + */ + public long get(long index) { + int iterations = 0; + while(index%sequenceInterval != 0) { + // The inverseSA0 ('$') doesn't have a usable ASCII representation; it must be treated as a special case. 
+ if(index == inverseSA0) + index = 0; + else { + byte base = bwt.getBase(index); + index = bwt.counts(base) + bwt.occurrences(base,index); + } + iterations++; + } + return (sequence[(int)(index/sequenceInterval)]+iterations) % length(); + } + + /** + * Create a suffix array from a given reference sequence. + * @param sequence The reference sequence to use when building the suffix array. + * @return a constructed suffix array. + */ + public static SuffixArray createFromReferenceSequence(byte[] sequence) { + // The builder for the suffix array. Use an integer in this case because + // Java arrays can only hold an integer. + TreeSet suffixArrayBuilder = new TreeSet(new SuffixArrayComparator(sequence)); + + Counts occurrences = new Counts(); + for( byte base: sequence ) + occurrences.increment(base); + + // Build out the suffix array using a custom comparator. + for( int i = 0; i <= sequence.length; i++ ) + suffixArrayBuilder.add(i); + + // Copy the suffix array into an array. + long[] suffixArray = new long[suffixArrayBuilder.size()]; + int i = 0; + for( Integer element: suffixArrayBuilder ) + suffixArray[i++] = element; + + // Find the first element in the inverse suffix array. + long inverseSA0 = -1; + for(i = 0; i < suffixArray.length; i++) { + if(suffixArray[i] == 0) + inverseSA0 = i; + } + if(inverseSA0 < 0) + throw new ReviewedStingException("Unable to find first inverse SA entry in generated suffix array."); + + return new SuffixArray(inverseSA0,occurrences,suffixArray); + } + + /** + * Compares two suffix arrays of the given sequence. Will return whichever string appears + * first in lexicographic order. + */ + private static class SuffixArrayComparator implements Comparator { + /** + * The data source for all suffix arrays. + */ + private final String sequence; + + /** + * Create a new comparator. + * @param sequence Reference sequence to use as basis for comparison. 
+ */ + public SuffixArrayComparator( byte[] sequence ) { + // Processing the suffix array tends to be easier as a string. + this.sequence = StringUtil.bytesToString(sequence); + } + + /** + * Compare the two given suffix arrays. Criteria for comparison is the lexicographic order of + * the two substrings sequence[lhs:], sequence[rhs:]. + * @param lhs Left-hand side of comparison. + * @param rhs Right-hand side of comparison. + * @return How the suffix arrays represented by lhs, rhs compare. + */ + public int compare( Integer lhs, Integer rhs ) { + String lhsSuffixArray = sequence.substring(lhs); + String rhsSuffixArray = sequence.substring(rhs); + return lhsSuffixArray.compareTo(rhsSuffixArray); + } + } + +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayReader.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayReader.java new file mode 100644 index 000000000..b48e4c69c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayReader.java @@ -0,0 +1,85 @@ +package org.broadinstitute.sting.alignment.reference.bwt; + +import org.broadinstitute.sting.alignment.reference.packing.PackUtils; +import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedInputStream; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.ByteOrder; + +/** + * A reader for suffix arrays in permanent storage. + * + * @author mhanna + * @version 0.1 + */ +public class SuffixArrayReader { + /** + * Input stream from which to read suffix array data. + */ + private FileInputStream inputStream; + + /** + * BWT to use to fill in missing data. + */ + private BWT bwt; + + /** + * Create a new suffix array reader. + * @param inputFile File in which the suffix array is stored. 
+ * @param bwt BWT to use when filling in missing data. + */ + public SuffixArrayReader(File inputFile, BWT bwt) { + try { + this.inputStream = new FileInputStream(inputFile); + this.bwt = bwt; + } + catch( FileNotFoundException ex ) { + throw new ReviewedStingException("Unable to open input file", ex); + } + } + + /** + * Read a suffix array from the input stream. + * @return The suffix array stored in the input stream. + */ + public SuffixArray read() { + UnsignedIntPackedInputStream uintPackedInputStream = new UnsignedIntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN); + + long inverseSA0; + long[] occurrences; + long[] suffixArray; + int suffixArrayInterval; + + try { + inverseSA0 = uintPackedInputStream.read(); + occurrences = new long[PackUtils.ALPHABET_SIZE]; + uintPackedInputStream.read(occurrences); + // Throw away the suffix array size in bytes and use the occurrences table directly. + suffixArrayInterval = (int)uintPackedInputStream.read(); + suffixArray = new long[(int)((occurrences[occurrences.length-1]+suffixArrayInterval-1)/suffixArrayInterval)]; + uintPackedInputStream.read(suffixArray); + } + catch( IOException ex ) { + throw new ReviewedStingException("Unable to read BWT from input stream.", ex); + } + + return new SuffixArray(inverseSA0, new Counts(occurrences,true), suffixArray, suffixArrayInterval, bwt); + } + + + /** + * Close the input stream. 
+ */ + public void close() { + try { + inputStream.close(); + } + catch( IOException ex ) { + throw new ReviewedStingException("Unable to close input file", ex); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java new file mode 100644 index 000000000..b6f79be2f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/SuffixArrayWriter.java @@ -0,0 +1,67 @@ +package org.broadinstitute.sting.alignment.reference.bwt; + +import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedOutputStream; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.*; +import java.nio.ByteOrder; + +/** + * Javadoc goes here. + * + * @author mhanna + * @version 0.1 + */ +public class SuffixArrayWriter { + /** + * Input stream from which to read suffix array data. + */ + private OutputStream outputStream; + + /** + * Create a new suffix array reader. + * @param outputFile File in which the suffix array is stored. + */ + public SuffixArrayWriter( File outputFile ) { + try { + this.outputStream = new BufferedOutputStream(new FileOutputStream(outputFile)); + } + catch( FileNotFoundException ex ) { + throw new ReviewedStingException("Unable to open input file", ex); + } + } + + /** + * Write a suffix array to the output stream. + * @param suffixArray suffix array to write. + */ + public void write(SuffixArray suffixArray) { + UnsignedIntPackedOutputStream uintPackedOutputStream = new UnsignedIntPackedOutputStream(outputStream, ByteOrder.LITTLE_ENDIAN); + + try { + uintPackedOutputStream.write(suffixArray.inverseSA0); + uintPackedOutputStream.write(suffixArray.occurrences.toArray(true)); + // How frequently the suffix array entry is placed. + uintPackedOutputStream.write(1); + // Length of the suffix array. 
+ uintPackedOutputStream.write(suffixArray.length()-1); + uintPackedOutputStream.write(suffixArray.sequence,1,suffixArray.sequence.length-1); + } + catch( IOException ex ) { + throw new ReviewedStingException("Unable to read BWT from input stream.", ex); + } + } + + + /** + * Close the input stream. + */ + public void close() { + try { + outputStream.close(); + } + catch( IOException ex ) { + throw new ReviewedStingException("Unable to close input file", ex); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedInputStream.java b/public/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedInputStream.java new file mode 100644 index 000000000..174a9853b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedInputStream.java @@ -0,0 +1,95 @@ +package org.broadinstitute.sting.alignment.reference.packing; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; + +/** + * Reads a packed version of the input stream. + * + * @author mhanna + * @version 0.1 + */ +public class BasePackedInputStream { + /** + * Type of object to unpack. + */ + private final Class type; + + /** + * Ultimate source for packed bases. + */ + private final FileInputStream targetInputStream; + + /** + * Channel source for packed bases. + */ + private final FileChannel targetInputChannel; + + /** + * A fixed-size buffer for word-packed data. + */ + private final ByteOrder byteOrder; + + /** + * How many bases are in a given packed word. + */ + private final int basesPerPackedWord = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BASE; + + /** + * How many bytes in an integer? 
+ */ + private final int bytesPerInteger = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE; + + + public BasePackedInputStream( Class type, File inputFile, ByteOrder byteOrder ) throws FileNotFoundException { + this(type,new FileInputStream(inputFile),byteOrder); + } + + public BasePackedInputStream( Class type, FileInputStream inputStream, ByteOrder byteOrder ) { + if( type != Integer.class ) + throw new ReviewedStingException("Only bases packed into 32-bit words are currently supported by this input stream. Type specified: " + type.getName()); + this.type = type; + this.targetInputStream = inputStream; + this.targetInputChannel = inputStream.getChannel(); + this.byteOrder = byteOrder; + } + + /** + * Read the entire contents of the input stream. + * @param bwt array into which bases should be read. + * @throws IOException if an I/O error occurs. + */ + public void read(byte[] bwt) throws IOException { + read(bwt,0,bwt.length); + } + + /** + * Read the next length bases into the bwt array, starting at the given offset. + * @param bwt array holding the given data. + * @param offset target position in the bases array into which bytes should be written. + * @param length number of bases to read from the stream. + * @throws IOException if an I/O error occurs. 
+ */ + public void read(byte[] bwt, int offset, int length) throws IOException { + int bufferWidth = ((bwt.length+basesPerPackedWord-1)/basesPerPackedWord)*bytesPerInteger; + ByteBuffer buffer = ByteBuffer.allocate(bufferWidth).order(byteOrder); + targetInputChannel.read(buffer); + targetInputChannel.position(targetInputChannel.position()+buffer.remaining()); + buffer.flip(); + + int packedWord = 0; + int i = 0; + while(i < length) { + if(i % basesPerPackedWord == 0) packedWord = buffer.getInt(); + int position = basesPerPackedWord - i%basesPerPackedWord - 1; + bwt[offset+i++] = PackUtils.unpackBase((byte)((packedWord >> position*PackUtils.BITS_PER_BASE) & 0x3)); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedOutputStream.java b/public/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedOutputStream.java new file mode 100644 index 000000000..c62f40e51 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/packing/BasePackedOutputStream.java @@ -0,0 +1,140 @@ +package org.broadinstitute.sting.alignment.reference.packing; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.*; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +/** + * A general-purpose stream for writing packed bases. + * + * @author mhanna + * @version 0.1 + */ +public class BasePackedOutputStream { + /** + * Type of object to pack. + */ + private final Class type; + + /** + * How many bases can be stored in the given data structure? + */ + private final int basesPerType; + + /** + * Ultimate target for the packed bases. + */ + private final OutputStream targetOutputStream; + + /** + * A fixed-size buffer for word-packed data. 
+ */ + private final ByteBuffer buffer; + + public BasePackedOutputStream( Class type, File outputFile, ByteOrder byteOrder ) throws FileNotFoundException { + this(type,new BufferedOutputStream(new FileOutputStream(outputFile)),byteOrder); + } + + /** + * Write packed bases to the given output stream. + * @param type Type of data to pack bases into. + * @param outputStream Output stream to which to write packed bases. + * @param byteOrder Switch between big endian / little endian when reading / writing files. + */ + public BasePackedOutputStream( Class type, OutputStream outputStream, ByteOrder byteOrder) { + this.targetOutputStream = outputStream; + this.type = type; + basesPerType = PackUtils.bitsInType(type)/PackUtils.BITS_PER_BASE; + this.buffer = ByteBuffer.allocate(basesPerType/PackUtils.ALPHABET_SIZE).order(byteOrder); + } + + /** + * Writes the given base to the output stream. Will write only this base; no packing will be performed. + * @param base List of bases to write. + * @throws IOException if an I/O error occurs. + */ + public void write( int base ) throws IOException { + write( new byte[] { (byte)base } ); + } + + /** + * Writes an array of bases to the target output stream. + * @param bases List of bases to write. + * @throws IOException if an I/O error occurs. + */ + public void write( byte[] bases ) throws IOException { + write(bases,0,bases.length); + } + + /** + * Writes a subset of the array of bases to the output stream. + * @param bases List of bases to write. + * @param offset site at which to start writing. + * @param length number of bases to write. + * @throws IOException if an I/O error occurs. + */ + public void write( byte[] bases, int offset, int length ) throws IOException { + int packedBases = 0; + int positionInPack = 0; + + for( int base = offset; base < offset+length; base++ ) { + packedBases = packBase(bases[base], packedBases, positionInPack); + + // Increment the packed counter. 
If all possible bases have been squeezed into this byte, write it out. + positionInPack = ++positionInPack % basesPerType; + if( positionInPack == 0 ) { + writePackedBases(packedBases); + packedBases = 0; + } + } + + if( positionInPack > 0 ) + writePackedBases(packedBases); + } + + /** + * Flush the contents of the OutputStream to disk. + * @throws IOException if an I/O error occurs. + */ + public void flush() throws IOException { + targetOutputStream.flush(); + } + + /** + * Closes the given output stream. + * @throws IOException if an I/O error occurs. + */ + public void close() throws IOException { + targetOutputStream.close(); + } + + /** + * Pack the given base into the basepack. + * @param base The base to pack. + * @param basePack Target for the pack operation. + * @param position Position within the pack to which to add the base. + * @return The packed integer. + */ + private int packBase( byte base, int basePack, int position ) { + basePack |= (PackUtils.packBase(base) << 2*(basesPerType-position-1)); + return basePack; + } + + /** + * Write the given packed base structure to the output file. + * @param packedBases Packed bases to write. + * @throws IOException on error writing to the file. 
+ */ + private void writePackedBases(int packedBases) throws IOException { + buffer.rewind(); + if( type == Integer.class ) + buffer.putInt(packedBases); + else if( type == Byte.class ) + buffer.put((byte)packedBases); + else + throw new ReviewedStingException("Cannot pack bases into type " + type.getName()); + targetOutputStream.write(buffer.array()); + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java b/public/java/src/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java new file mode 100755 index 000000000..561535e29 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/packing/CreatePACFromReference.java @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.alignment.reference.packing; + +import net.sf.picard.reference.ReferenceSequence; +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.reference.ReferenceSequenceFileFactory; + +import java.io.File; +import java.io.IOException; + +/** + * Generate a .PAC file from a given reference. + * + * @author hanna + * @version 0.1 + */ + +public class CreatePACFromReference { + public static void main( String argv[] ) throws IOException { + if( argv.length != 3 ) { + System.out.println("USAGE: CreatePACFromReference .fasta "); + return; + } + + // Read in the first sequence in the input file + String inputFileName = argv[0]; + File inputFile = new File(inputFileName); + ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile); + ReferenceSequence sequence = reference.nextSequence(); + + // Target file for output + PackUtils.writeReferenceSequence( new File(argv[1]), sequence.getBases() ); + + // Reverse the bases in the reference + PackUtils.reverse(sequence.getBases()); + + // Target file for output + PackUtils.writeReferenceSequence( new File(argv[2]), sequence.getBases() ); + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java b/public/java/src/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java new file mode 100644 index 000000000..972e31cf0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/packing/PackUtils.java @@ -0,0 +1,135 @@ +package org.broadinstitute.sting.alignment.reference.packing; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteOrder; + +/** + * Utilities designed for packing / unpacking bases. 
+ * + * @author mhanna + * @version 0.1 + */ +public class PackUtils { + /** + * How many possible bases can be encoded? + */ + public static final int ALPHABET_SIZE = 4; + + /** + * How many bits does it take to store a single base? + */ + public static final int BITS_PER_BASE = (int)(Math.log(ALPHABET_SIZE)/Math.log(2)); + + /** + * How many bits fit into a single byte? + */ + public static final int BITS_PER_BYTE = 8; + + /** + * Writes a reference sequence to a PAC file. + * @param outputFile Filename for the PAC file. + * @param referenceSequence Reference sequence to write. + * @throws IOException If there's a problem writing to the output file. + */ + public static void writeReferenceSequence( File outputFile, byte[] referenceSequence ) throws IOException { + OutputStream outputStream = new FileOutputStream(outputFile); + + BasePackedOutputStream basePackedOutputStream = new BasePackedOutputStream(Byte.class, outputStream, ByteOrder.BIG_ENDIAN); + basePackedOutputStream.write(referenceSequence); + + outputStream.write(referenceSequence.length%PackUtils.ALPHABET_SIZE); + + outputStream.close(); + } + + + /** + * How many bits can a given type hold? + * @param type Type to test. + * @return Number of bits that the given type can hold. 
+ */ + public static int bitsInType( Class type ) { + try { + long typeSize = type.getField("MAX_VALUE").getLong(null) - type.getField("MIN_VALUE").getLong(null)+1; + long intTypeSize = (long)Integer.MAX_VALUE - (long)Integer.MIN_VALUE + 1; + if( typeSize > intTypeSize ) + throw new ReviewedStingException("Cannot determine number of bits available in type: " + type.getName()); + return (int)(Math.log(typeSize)/Math.log(2)); + } + catch( NoSuchFieldException ex ) { + throw new ReviewedStingException("Cannot determine number of bits available in type: " + type.getName(),ex); + } + catch( IllegalAccessException ex ) { + throw new ReviewedStingException("Cannot determine number of bits available in type: " + type.getName(),ex); + } + } + + /** + * Gets the two-bit representation of a base. A=00b, C=01b, G=10b, T=11b. + * @param base ASCII value for the base to pack. + * @return A byte from 0-3 indicating the base's packed value. + */ + public static byte packBase(byte base) { + switch( base ) { + case 'A': + return 0; + case 'C': + return 1; + case 'G': + return 2; + case 'T': + return 3; + default: + throw new ReviewedStingException("Unknown base type: " + base); + } + } + + /** + * Converts a two-bit representation of a base into an ASCII representation of a base. + * @param pack Byte from 0-3 indicating which base is represented. + * @return An ASCII value representing the packed base. + */ + public static byte unpackBase(byte pack) { + switch( pack ) { + case 0: + return 'A'; + case 1: + return 'C'; + case 2: + return 'G'; + case 3: + return 'T'; + default: + throw new ReviewedStingException("Unknown pack type: " + pack); + } + } + + /** + * Reverses an unpacked sequence of bases. + * @param bases bases to reverse. 
+ */ + public static void reverse( byte[] bases ) { + for( int i = 0, j = bases.length-1; i < j; i++, j-- ) { + byte temp = bases[j]; + bases[j] = bases[i]; + bases[i] = temp; + } + } + + /** + * Given a structure of size size that should be split + * into partitionSize partitions, how many partitions should + * be created? Size of last partition will be <= partitionSize. + * @param size Total size of the data structure. + * @param partitionSize Size of an individual partition. + * @return Number of partitions that would be created. + */ + public static int numberOfPartitions( long size, long partitionSize ) { + return (int)((size+partitionSize-1) / partitionSize); + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedInputStream.java b/public/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedInputStream.java new file mode 100644 index 000000000..999e54451 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedInputStream.java @@ -0,0 +1,104 @@ +package org.broadinstitute.sting.alignment.reference.packing; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; + +/** + * Read a set of integers packed into + * + * @author mhanna + * @version 0.1 + */ +public class UnsignedIntPackedInputStream { + /** + * Ultimate target for the occurrence array. + */ + private final FileInputStream targetInputStream; + + /** + * Target channel from which to pull file data. + */ + private final FileChannel targetInputChannel; + + /** + * The byte order in which integer input data appears. + */ + private final ByteOrder byteOrder; + + /** + * How many bytes are required to store an integer? 
+ */ + private final int bytesPerInteger = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE; + + /** + * Create a new PackedIntInputStream, writing to the given target file. + * @param inputFile target input file. + * @param byteOrder Endianness to use when writing a list of integers. + * @throws java.io.IOException if an I/O error occurs. + */ + public UnsignedIntPackedInputStream(File inputFile, ByteOrder byteOrder) throws IOException { + this(new FileInputStream(inputFile),byteOrder); + } + + /** + * Read ints from the given InputStream. + * @param inputStream Input stream from which to read ints. + * @param byteOrder Endianness to use when writing a list of integers. + */ + public UnsignedIntPackedInputStream(FileInputStream inputStream, ByteOrder byteOrder) { + this.targetInputStream = inputStream; + this.targetInputChannel = inputStream.getChannel(); + this.byteOrder = byteOrder; + } + + /** + * Read a datum from the input stream. + * @return The next input datum in the stream. + * @throws IOException if an I/O error occurs. + */ + public long read() throws IOException { + long[] data = new long[1]; + read(data); + return data[0]; + } + + /** + * Read the data from the input stream. + * @param data placeholder for input data. + * @throws IOException if an I/O error occurs. + */ + public void read( long[] data ) throws IOException { + read( data, 0, data.length ); + } + + /** + * Read the data from the input stream, starting at the given offset. + * @param data placeholder for input data. + * @param offset place in the array to start reading in data. + * @param length number of ints to read in. + * @throws IOException if an I/O error occurs. 
+ */ + public void read( long[] data, int offset, int length ) throws IOException { + ByteBuffer readBuffer = ByteBuffer.allocate(bytesPerInteger*length).order(byteOrder); + + targetInputChannel.read(readBuffer,targetInputChannel.position()); + readBuffer.flip(); + targetInputChannel.position(targetInputChannel.position()+readBuffer.remaining()); + + int i = 0; + while(i < length) + data[offset+i++] = readBuffer.getInt() & 0xFFFFFFFFL; + } + + /** + * Closes the given output stream. + * @throws IOException if an I/O error occurs. + */ + public void close() throws IOException { + targetInputStream.close(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedOutputStream.java b/public/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedOutputStream.java new file mode 100755 index 000000000..b02024366 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/alignment/reference/packing/UnsignedIntPackedOutputStream.java @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.alignment.reference.packing; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +/** + * Writes an list of integers to the output file. + * + * @author mhanna + * @version 0.1 + */ +public class UnsignedIntPackedOutputStream { + /** + * Ultimate target for the occurrence array. + */ + private final OutputStream targetOutputStream; + + /** + * A fixed-size buffer for int-packed data. + */ + private final ByteBuffer buffer; + + /** + * Create a new PackedIntOutputStream, writing to the given target file. + * @param outputFile target output file. + * @param byteOrder Endianness to use when writing a list of integers. + * @throws IOException if an I/O error occurs. + */ + public UnsignedIntPackedOutputStream(File outputFile, ByteOrder byteOrder) throws IOException { + this(new FileOutputStream(outputFile),byteOrder); + } + + /** + * Write packed ints to the given OutputStream. + * @param outputStream Output stream to which to write packed ints. + * @param byteOrder Endianness to use when writing a list of integers. + */ + public UnsignedIntPackedOutputStream(OutputStream outputStream, ByteOrder byteOrder) { + this.targetOutputStream = outputStream; + buffer = ByteBuffer.allocate(PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE).order(byteOrder); + } + + /** + * Write the data to the output stream. + * @param datum datum to write. + * @throws IOException if an I/O error occurs. 
+ */ + public void write( long datum ) throws IOException { + buffer.rewind(); + buffer.putInt((int)datum); + targetOutputStream.write(buffer.array()); + } + + /** + * Write the data to the output stream. + * @param data data to write. occurrences.length must match alphabet size. + * @throws IOException if an I/O error occurs. + */ + public void write( long[] data ) throws IOException { + for(long datum: data) + write(datum); + } + + /** + * Write the given chunk of data to the input stream. + * @param data data to write. + * @param offset position at which to start. + * @param length number of ints to write. + * @throws IOException if an I/O error occurs. + */ + public void write( long[] data, int offset, int length ) throws IOException { + for( int i = offset; i < offset+length; i++ ) + write(data[i]); + } + + /** + * Flush the contents of the OutputStream to disk. + * @throws IOException if an I/O error occurs. + */ + public void flush() throws IOException { + targetOutputStream.flush(); + } + + /** + * Closes the given output stream. + * @throws IOException if an I/O error occurs. + */ + public void close() throws IOException { + targetOutputStream.close(); + } + +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitions.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitions.java index 9f92df6e0..8e3f753a8 100755 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitions.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentDefinitions.java @@ -174,7 +174,8 @@ public class ArgumentDefinitions implements Iterable { static DefinitionMatcher VerifiableDefinitionMatcher = new DefinitionMatcher() { public boolean matches( ArgumentDefinition definition, Object key ) { - return definition.validation != null; + // We can perform some sort of validation for anything that isn't a flag. 
+ return !definition.isFlag; } }; } diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java index 60ed8c899..351583c07 100755 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java @@ -44,7 +44,7 @@ public class ArgumentMatch implements Iterable { public final String label; /** - * Maps indicies of command line arguments to values paired with that argument. + * Maps indices of command line arguments to values paired with that argument. */ public final SortedMap> indices = new TreeMap>(); diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java index aba4fc109..d88e7030e 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java +++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java @@ -43,7 +43,7 @@ import java.util.Locale; public abstract class CommandLineProgram { /** The command-line program and the arguments it returned. */ - protected ParsingEngine parser = null; + public ParsingEngine parser = null; /** the default log level */ @Argument(fullName = "logging_level", @@ -144,6 +144,11 @@ public abstract class CommandLineProgram { public static int result = -1; + @SuppressWarnings("unchecked") + public static void start(CommandLineProgram clp, String[] args) throws Exception { + start(clp, args, false); + } + /** * This function is called to start processing the command line, and kick * off the execute message of the program. 
@@ -153,7 +158,7 @@ public abstract class CommandLineProgram { * @throws Exception when an exception occurs */ @SuppressWarnings("unchecked") - public static void start(CommandLineProgram clp, String[] args) throws Exception { + public static void start(CommandLineProgram clp, String[] args, boolean dryRun) throws Exception { try { // setup our log layout @@ -180,8 +185,9 @@ public abstract class CommandLineProgram { // - InvalidArgument in case these arguments are specified by plugins. // - MissingRequiredArgument in case the user requested help. Handle that later, once we've // determined the full complement of arguments. - parser.validate(EnumSet.of(ParsingEngine.ValidationType.MissingRequiredArgument, - ParsingEngine.ValidationType.InvalidArgument)); + if ( ! dryRun ) + parser.validate(EnumSet.of(ParsingEngine.ValidationType.MissingRequiredArgument, + ParsingEngine.ValidationType.InvalidArgument)); parser.loadArgumentsIntoObject(clp); // Initialize the logger using the loaded command line. @@ -195,36 +201,40 @@ public abstract class CommandLineProgram { if (isHelpPresent(parser)) printHelpAndExit(clp, parser); - parser.validate(); + if ( ! dryRun ) parser.validate(); } else { parser.parse(args); - if (isHelpPresent(parser)) - printHelpAndExit(clp, parser); + if ( ! dryRun ) { + if (isHelpPresent(parser)) + printHelpAndExit(clp, parser); - parser.validate(); + parser.validate(); + } parser.loadArgumentsIntoObject(clp); // Initialize the logger using the loaded command line. clp.setupLoggerLevel(layout); } - // if they specify a log location, output our data there - if (clp.toFile != null) { - FileAppender appender; - try { - appender = new FileAppender(layout, clp.toFile, false); - logger.addAppender(appender); - } catch (IOException e) { - throw new RuntimeException("Unable to re-route log output to " + clp.toFile + " make sure the destination exists"); + if ( ! 
dryRun ) { + // if they specify a log location, output our data there + if (clp.toFile != null) { + FileAppender appender; + try { + appender = new FileAppender(layout, clp.toFile, false); + logger.addAppender(appender); + } catch (IOException e) { + throw new RuntimeException("Unable to re-route log output to " + clp.toFile + " make sure the destination exists"); + } } + + // regardless of what happens next, generate the header information + HelpFormatter.generateHeaderInformation(clp.getApplicationDetails(), args); + + // call the execute + CommandLineProgram.result = clp.execute(); } - - // regardless of what happens next, generate the header information - HelpFormatter.generateHeaderInformation(clp.getApplicationDetails(), args); - - // call the execute - CommandLineProgram.result = clp.execute(); } catch (ArgumentException e) { clp.parser.printHelp(clp.getApplicationDetails()); diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java index 8423bb2f2..b7efcd278 100755 --- a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java @@ -41,11 +41,16 @@ import java.util.*; * A parser for Sting command-line arguments. */ public class ParsingEngine { + /** + * The loaded argument sources along with their back definitions. + */ + private Map argumentSourcesByDefinition = new HashMap(); + /** * A list of defined arguments against which command lines are matched. * Package protected for testing access. */ - ArgumentDefinitions argumentDefinitions = new ArgumentDefinitions(); + public ArgumentDefinitions argumentDefinitions = new ArgumentDefinitions(); /** * A list of matches from defined arguments to command-line text. 
@@ -107,8 +112,13 @@ public class ParsingEngine { */ public void addArgumentSource( String sourceName, Class sourceClass ) { List argumentsFromSource = new ArrayList(); - for( ArgumentSource argumentSource: extractArgumentSources(sourceClass) ) - argumentsFromSource.addAll( argumentSource.createArgumentDefinitions() ); + for( ArgumentSource argumentSource: extractArgumentSources(sourceClass) ) { + List argumentDefinitions = argumentSource.createArgumentDefinitions(); + for(ArgumentDefinition argumentDefinition: argumentDefinitions) { + argumentSourcesByDefinition.put(argumentDefinition,argumentSource); + argumentsFromSource.add( argumentDefinition ); + } + } argumentDefinitions.add( new ArgumentDefinitionGroup(sourceName, argumentsFromSource) ); } @@ -199,16 +209,25 @@ public class ParsingEngine { throw new InvalidArgumentException( invalidArguments ); } - // Find invalid argument values (arguments that fail the regexp test. + // Find invalid argument values -- invalid arguments are either completely missing or fail the specified 'validation' regular expression. if( !skipValidationOf.contains(ValidationType.InvalidArgumentValue) ) { Collection verifiableArguments = argumentDefinitions.findArgumentDefinitions( null, ArgumentDefinitions.VerifiableDefinitionMatcher ); Collection> invalidValues = new ArrayList>(); for( ArgumentDefinition verifiableArgument: verifiableArguments ) { ArgumentMatches verifiableMatches = argumentMatches.findMatches( verifiableArgument ); + // Check to see whether an argument value was specified. Argument values must be provided + // when the argument name is specified and the argument is not a flag type. 
+ for(ArgumentMatch verifiableMatch: verifiableMatches) { + ArgumentSource argumentSource = argumentSourcesByDefinition.get(verifiableArgument); + if(verifiableMatch.values().size() == 0 && !verifiableArgument.isFlag && argumentSource.createsTypeDefault()) + invalidValues.add(new Pair(verifiableArgument,null)); + } + + // Ensure that the field contents meet the validation criteria specified by the regular expression. for( ArgumentMatch verifiableMatch: verifiableMatches ) { for( String value: verifiableMatch.values() ) { - if( !value.matches(verifiableArgument.validation) ) + if( verifiableArgument.validation != null && !value.matches(verifiableArgument.validation) ) invalidValues.add( new Pair(verifiableArgument, value) ); } } @@ -515,10 +534,14 @@ class InvalidArgumentValueException extends ArgumentException { private static String formatArguments( Collection> invalidArgumentValues ) { StringBuilder sb = new StringBuilder(); for( Pair invalidValue: invalidArgumentValues ) { - sb.append( String.format("%nArgument '--%s' has value of incorrect format: %s (should match %s)", - invalidValue.first.fullName, - invalidValue.second, - invalidValue.first.validation) ); + if(invalidValue.getSecond() == null) + sb.append( String.format("%nArgument '--%s' requires a value but none was provided", + invalidValue.first.fullName) ); + else + sb.append( String.format("%nArgument '--%s' has value of incorrect format: %s (should match %s)", + invalidValue.first.fullName, + invalidValue.second, + invalidValue.first.validation) ); } return sb.toString(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index da2be74bf..2af29ea70 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -30,25 +30,27 @@ import org.broadinstitute.sting.commandline.Argument; import 
org.broadinstitute.sting.commandline.ArgumentCollection; import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.walkers.Attribution; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.ApplicationDetails; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.text.TextFormattingUtils; import java.util.*; /** - * @author aaron - * @version 1.0 - * @date May 8, 2009 - *

- * Class CommandLineGATK - *

+ * The GATK engine itself. Manages map/reduce data access and runs walkers. + * * We run command line GATK programs using this class. It gets the command line args, parses them, and hands the * gatk all the parsed out information. Pretty much anything dealing with the underlying system should go here, * the gatk engine should deal with any data related information. */ +@DocumentedGATKFeature( + groupName = "GATK Engine", + summary = "Features and arguments for the GATK engine itself, available to all walkers.", + extraDocs = { ReadFilter.class, UserException.class }) public class CommandLineGATK extends CommandLineExecutable { @Argument(fullName = "analysis_type", shortName = "T", doc = "Type of analysis to run") private String analysisName = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java index cf190835e..6aeb42faa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java @@ -33,9 +33,7 @@ import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.help.DescriptionTaglet; -import org.broadinstitute.sting.utils.help.DisplayNameTaglet; -import org.broadinstitute.sting.utils.help.SummaryTaglet; +import org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet; import org.broadinstitute.sting.utils.text.TextFormattingUtils; import java.util.*; @@ -82,19 +80,10 @@ public class WalkerManager extends PluginManager { * @return A suitable display name for the package. */ public String getPackageDisplayName(String packageName) { - // Try to find an override for the display name of this package. 
- String displayNameKey = String.format("%s.%s",packageName,DisplayNameTaglet.NAME); - String displayName; - if(helpText.containsKey(displayNameKey)) { - displayName = helpText.getString(displayNameKey); - } - else { - // If no override exists... - // ...try to compute the override from the text of the package name, while accounting for - // unpackaged walkers. - displayName = packageName.substring(packageName.lastIndexOf('.')+1); - if(displayName.trim().equals("")) displayName = ""; - } + // ...try to compute the override from the text of the package name, while accounting for + // unpackaged walkers. + String displayName = packageName.substring(packageName.lastIndexOf('.')+1); + if (displayName.trim().equals("")) displayName = ""; return displayName; } @@ -104,7 +93,7 @@ public class WalkerManager extends PluginManager { * @return Package help text, or "" if none exists. */ public String getPackageSummaryText(String packageName) { - String key = String.format("%s.%s",packageName,SummaryTaglet.NAME); + String key = String.format("%s.%s",packageName, ResourceBundleExtractorDoclet.SUMMARY_TAGLET_NAME); if(!helpText.containsKey(key)) return ""; return helpText.getString(key); @@ -116,7 +105,7 @@ public class WalkerManager extends PluginManager { * @return Walker summary description, or "" if none exists. */ public String getWalkerSummaryText(Class walkerType) { - String walkerSummary = String.format("%s.%s",walkerType.getName(), SummaryTaglet.NAME); + String walkerSummary = String.format("%s.%s",walkerType.getName(), ResourceBundleExtractorDoclet.SUMMARY_TAGLET_NAME); if(!helpText.containsKey(walkerSummary)) return ""; return helpText.getString(walkerSummary); @@ -137,7 +126,7 @@ public class WalkerManager extends PluginManager { * @return Walker full description, or "" if none exists. 
*/ public String getWalkerDescriptionText(Class walkerType) { - String walkerDescription = String.format("%s.%s",walkerType.getName(), DescriptionTaglet.NAME); + String walkerDescription = String.format("%s.%s",walkerType.getName(), ResourceBundleExtractorDoclet.DESCRIPTION_TAGLET_NAME); if(!helpText.containsKey(walkerDescription)) return ""; return helpText.getString(walkerDescription); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 6064806f3..572970349 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -893,6 +893,7 @@ public class SAMDataSource { * Custom representation of interval bounds. * Makes it simpler to track current position. */ + private int[] intervalContigIndices; private int[] intervalStarts; private int[] intervalEnds; @@ -917,12 +918,14 @@ public class SAMDataSource { if(foundMappedIntervals) { if(keepOnlyUnmappedReads) throw new ReviewedStingException("Tried to apply IntervalOverlapFilteringIterator to a mixed of mapped and unmapped intervals. Please apply this filter to only mapped or only unmapped reads"); + this.intervalContigIndices = new int[intervals.size()]; this.intervalStarts = new int[intervals.size()]; this.intervalEnds = new int[intervals.size()]; int i = 0; for(GenomeLoc interval: intervals) { - intervalStarts[i] = (int)interval.getStart(); - intervalEnds[i] = (int)interval.getStop(); + intervalContigIndices[i] = interval.getContigIndex(); + intervalStarts[i] = interval.getStart(); + intervalEnds[i] = interval.getStop(); i++; } } @@ -961,11 +964,10 @@ public class SAMDataSource { while(nextRead == null && (keepOnlyUnmappedReads || currentBound < intervalStarts.length)) { if(!keepOnlyUnmappedReads) { // Mapped read filter; check against GenomeLoc-derived bounds. 
- if(candidateRead.getAlignmentEnd() >= intervalStarts[currentBound] || - (candidateRead.getReadUnmappedFlag() && candidateRead.getAlignmentStart() >= intervalStarts[currentBound])) { - // This read ends after the current interval begins (or, if unmapped, starts within the bounds of the interval. + if(readEndsOnOrAfterStartingBound(candidateRead)) { + // This read ends after the current interval begins. // Promising, but this read must be checked against the ending bound. - if(candidateRead.getAlignmentStart() <= intervalEnds[currentBound]) { + if(readStartsOnOrBeforeEndingBound(candidateRead)) { // Yes, this read is within both bounds. This must be our next read. nextRead = candidateRead; break; @@ -993,6 +995,37 @@ public class SAMDataSource { candidateRead = iterator.next(); } } + + /** + * Check whether the read lies after the start of the current bound. If the read is unmapped but placed, its + * end will be distorted, so rely only on the alignment start. + * @param read The read to position-check. + * @return True if the read starts after the current bounds. False otherwise. + */ + private boolean readEndsOnOrAfterStartingBound(final SAMRecord read) { + return + // Read ends on a later contig, or... + read.getReferenceIndex() > intervalContigIndices[currentBound] || + // Read ends of this contig... + (read.getReferenceIndex() == intervalContigIndices[currentBound] && + // either after this location, or... + (read.getAlignmentEnd() >= intervalStarts[currentBound] || + // read is unmapped but positioned and alignment start is on or after this start point. + (read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound]))); + } + + /** + * Check whether the read lies before the end of the current bound. + * @param read The read to position-check. + * @return True if the read starts after the current bounds. False otherwise. 
+ */ + private boolean readStartsOnOrBeforeEndingBound(final SAMRecord read) { + return + // Read starts on a prior contig, or... + read.getReferenceIndex() < intervalContigIndices[currentBound] || + // Read starts on this contig and the alignment start is registered before this end point. + (read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]); + } } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 9466fdf75..48fd73e0b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -49,7 +49,7 @@ public class LinearMicroScheduler extends MicroScheduler { Accumulator accumulator = Accumulator.create(engine,walker); int counter = 0; - for (Shard shard : processingTracker.onlyOwned(shardStrategy, engine.getName())) { + for (Shard shard : shardStrategy ) { if ( shard == null ) // we ran out of shards that aren't owned break; diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 23e5769f1..e731b9864 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -39,14 +39,10 @@ import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.threading.*; import javax.management.JMException; import javax.management.MBeanServer; import javax.management.ObjectName; -import java.io.FileNotFoundException; -import 
java.io.FileOutputStream; -import java.io.PrintStream; import java.lang.management.ManagementFactory; import java.util.Collection; @@ -83,8 +79,6 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { private final MBeanServer mBeanServer; private final ObjectName mBeanName; - protected GenomeLocProcessingTracker processingTracker; - /** * MicroScheduler factory function. Create a microscheduler appropriate for reducing the * selected walker. @@ -98,11 +92,6 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @return The best-fit microscheduler. */ public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, int nThreadsToUse) { - if (engine.getArguments().processingTrackerFile != null) { - if ( walker instanceof ReadWalker ) - throw new UserException.BadArgumentValue("C", String.format("Distributed GATK processing not enabled for read walkers")); - } - if (walker instanceof TreeReducible && nThreadsToUse > 1) { if(walker.isReduceByInterval()) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. 
Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); @@ -157,33 +146,6 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { catch (JMException ex) { throw new ReviewedStingException("Unable to register microscheduler with JMX", ex); } - - // - // create the processing tracker - // - if ( engine.getArguments().processingTrackerFile != null ) { - logger.warn("Distributed GATK is an experimental engine feature, and is likely to not work correctly or reliably."); - if ( engine.getArguments().restartProcessingTracker && engine.getArguments().processingTrackerFile.exists() ) { - engine.getArguments().processingTrackerFile.delete(); - logger.info("Deleting ProcessingTracker file " + engine.getArguments().processingTrackerFile); - } - - PrintStream statusStream = null; - if ( engine.getArguments().processingTrackerStatusFile != null ) { - try { - statusStream = new PrintStream(new FileOutputStream(engine.getArguments().processingTrackerStatusFile)); - } catch ( FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(engine.getArguments().processingTrackerStatusFile, e); - } - } - - ClosableReentrantLock lock = new SharedFileThreadSafeLock(engine.getArguments().processingTrackerFile, engine.getArguments().processTrackerID); - processingTracker = new FileBackedGenomeLocProcessingTracker(engine.getArguments().processingTrackerFile, engine.getGenomeLocParser(), lock, statusStream) ; - logger.info("Creating ProcessingTracker using shared file " + engine.getArguments().processingTrackerFile + " process.id = " + engine.getName() + " CID = " + engine.getArguments().processTrackerID); - } else { - // create a NoOp version that doesn't do anything but say "yes" - processingTracker = new NoOpGenomeLocProcessingTracker(); - } } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ReadFilter.java index 
227637761..bf3ce352a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/ReadFilter.java @@ -2,10 +2,14 @@ package org.broadinstitute.sting.gatk.filters; import net.sf.picard.filter.SamRecordFilter; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; /** * A SamRecordFilter that also depends on the header. */ +@DocumentedGATKFeature( + groupName = "Read filters", + summary = "GATK Engine arguments that filter or transfer incoming SAM/BAM data files" ) public abstract class ReadFilter implements SamRecordFilter { /** * Sets the header for use by this filter. diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java index 1da03e9c2..ebb4cbe66 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java @@ -87,8 +87,8 @@ public class VCFWriterStorage implements Storage, VCFWriter { writer.writeHeader(stub.getVCFHeader()); } - public void add(VariantContext vc, byte ref) { - writer.add(vc, ref); + public void add(VariantContext vc) { + writer.add(vc); } /** @@ -117,7 +117,7 @@ public class VCFWriterStorage implements Storage, VCFWriter { BasicFeatureSource source = BasicFeatureSource.getFeatureSource(file.getAbsolutePath(), new VCFCodec(), false); for ( VariantContext vc : source.iterator() ) { - target.writer.add(vc, vc.getReferenceBaseForIndel()); + target.writer.add(vc); } source.close(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java index bb84f9457..7a110fde5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java @@ -192,8 +192,8 @@ public class VCFWriterStub implements Stub, VCFWriter { /** * @{inheritDoc} */ - public void add(VariantContext vc, byte ref) { - outputTracker.getStorage(this).add(vc,ref); + public void add(VariantContext vc) { + outputTracker.getStorage(this).add(vc); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index 69c0b3e0a..acee1a6a3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -154,9 +154,13 @@ public class GATKRunReport { private long nReads; public enum PhoneHomeOption { + /** Disable phone home */ NO_ET, + /** Standard option. Writes to local repository if it can be found, or S3 otherwise */ STANDARD, + /** Force output to STDOUT. For debugging only */ STDOUT, + /** Force output to S3. For debugging only */ AWS_S3 // todo -- remove me -- really just for testing purposes } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index 1d622e2c7..1c451575b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -112,24 +112,28 @@ public class VariantContextAdaptors { alleles.add(refAllele); // add all of the alt alleles + boolean sawNullAllele = false; for ( String alt : DbSNPHelper.getAlternateAlleleList(dbsnp) ) { if ( ! 
Allele.acceptableAlleleBases(alt) ) { //System.out.printf("Excluding dbsnp record %s%n", dbsnp); return null; } - alleles.add(Allele.create(alt, false)); + Allele altAllele = Allele.create(alt, false); + alleles.add(altAllele); + if ( altAllele.isNull() ) + sawNullAllele = true; } Map attributes = new HashMap(); attributes.put(VariantContext.ID_KEY, dbsnp.getRsID()); - if ( DbSNPHelper.isDeletion(dbsnp) ) { - int index = dbsnp.getStart() - ref.getWindow().getStart() - 1; - if ( index < 0 ) - return null; // we weren't given enough reference context to create the VariantContext - attributes.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, new Byte(ref.getBases()[index])); - } - Collection genotypes = null; - VariantContext vc = new VariantContext(name, dbsnp.getChr(),dbsnp.getStart() - (DbSNPHelper.isDeletion(dbsnp) ? 1 : 0),dbsnp.getEnd(), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attributes); + + int index = dbsnp.getStart() - ref.getWindow().getStart() - 1; + if ( index < 0 ) + return null; // we weren't given enough reference context to create the VariantContext + Byte refBaseForIndel = new Byte(ref.getBases()[index]); + + Map genotypes = null; + VariantContext vc = new VariantContext(name, dbsnp.getChr(), dbsnp.getStart() - (sawNullAllele ? 
1 : 0), dbsnp.getEnd(), alleles, genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, attributes, refBaseForIndel); return vc; } else return null; // can't handle anything else diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java deleted file mode 100755 index 6bba754be..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableCodec.java +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.refdata.features.annotator; - -import org.apache.log4j.Logger; -import org.broad.tribble.Feature; -import org.broad.tribble.exception.CodecLineParsingException; -import org.broad.tribble.readers.AsciiLineReader; -import org.broad.tribble.readers.LineReader; -import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.Utils; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.StringTokenizer; - -public class AnnotatorInputTableCodec implements ReferenceDependentFeatureCodec { - - private static Logger logger = Logger.getLogger(AnnotatorInputTableCodec.class); - - public static final String DELIMITER = "\t"; - - private ArrayList header; - - /** - * The parser to use when resolving genome-wide locations. - */ - private GenomeLocParser genomeLocParser; - - /** - * Set the parser to use when resolving genetic data. - * @param genomeLocParser The supplied parser. - */ - public void setGenomeLocParser(GenomeLocParser genomeLocParser) { - this.genomeLocParser = genomeLocParser; - } - - /** - * Parses the header. - * - * @param reader - * - * @return The # of header lines for this file. 
- */ - public Object readHeader(LineReader reader) - { - int[] lineCounter = new int[1]; - try { - header = readHeader(reader, lineCounter); - } catch(IOException e) { - throw new IllegalArgumentException("Unable to read from file.", e); - } - return header; - } - - public Class getFeatureType() { - return AnnotatorInputTableFeature.class; - } - - @Override - public Feature decodeLoc(String line) { - StringTokenizer st = new StringTokenizer(line, DELIMITER); - if ( st.countTokens() < 1 ) - throw new CodecLineParsingException("Couldn't parse GenomeLoc out of the following line because there aren't enough tokens.\nLine: " + line); - - GenomeLoc loc; - String chr = st.nextToken(); - if ( chr.indexOf(":") != -1 ) { - loc = genomeLocParser.parseGenomeLoc(chr); - } else { - if ( st.countTokens() < 3 ) - throw new CodecLineParsingException("Couldn't parse GenomeLoc out of the following line because there aren't enough tokens.\nLine: " + line); - loc = genomeLocParser.createGenomeLoc(chr, Integer.valueOf(st.nextToken()), Integer.valueOf(st.nextToken())); - } - return new AnnotatorInputTableFeature(loc.getContig(), loc.getStart(), loc.getStop()); - } - - - /** - * Parses the line into an AnnotatorInputTableFeature object. 
- * - * @param line - */ - public AnnotatorInputTableFeature decode(String line) { - final ArrayList header = this.header; //optimization - final ArrayList values = Utils.split(line, DELIMITER, header.size()); - - if ( values.size() != header.size()) { - throw new CodecLineParsingException(String.format("Encountered a line that has %d columns while the header has %d columns.\nHeader: " + header + "\nLine: " + values, values.size(), header.size())); - } - - final AnnotatorInputTableFeature feature = new AnnotatorInputTableFeature(header); - for ( int i = 0; i < header.size(); i++ ) { - feature.putColumnValue(header.get(i), values.get(i)); - } - - GenomeLoc loc; - if ( values.get(0).indexOf(":") != -1 ) - loc = genomeLocParser.parseGenomeLoc(values.get(0)); - else - loc = genomeLocParser.createGenomeLoc(values.get(0), Integer.valueOf(values.get(1)), Integer.valueOf(values.get(2))); - - //parse the location - feature.setChr(loc.getContig()); - feature.setStart((int)loc.getStart()); - feature.setEnd((int)loc.getStop()); - - return feature; - } - - /** - * Returns the header. - * @param source - * @return - * @throws IOException - */ - public static ArrayList readHeader(final File source) throws IOException { - FileInputStream is = new FileInputStream(source); - try { - return readHeader(new AsciiLineReader(is), null); - } finally { - is.close(); - } - } - - - /** - * Returns the header, and also sets the 2nd arg to the number of lines in the header. - * @param source - * @param lineCounter An array of length 1 or null. If not null, array[0] will be set to the number of lines in the header. - * @return The header fields. 
- * @throws IOException - */ - private static ArrayList readHeader(final LineReader source, int[] lineCounter) throws IOException { - - ArrayList header = null; - int numLines = 0; - - //find the 1st line that's non-empty and not a comment - String line = null; - while( (line = source.readLine()) != null ) { - numLines++; - if ( line.trim().isEmpty() || line.startsWith("#") ) { - continue; - } - - //parse the header - header = Utils.split(line, DELIMITER); - break; - } - - // check that we found the header - if ( header == null ) { - throw new IllegalArgumentException("No header in " + source + ". All lines are either comments or empty."); - } - - if(lineCounter != null) { - lineCounter[0] = numLines; - } - - logger.debug(String.format("Found header line containing %d columns:\n[%s]", header.size(), Utils.join("\t", header))); - - return header; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java deleted file mode 100755 index d12badd28..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/annotator/AnnotatorInputTableFeature.java +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata.features.annotator; - -import org.broad.tribble.Feature; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -/** - * This class represents a single record in an AnnotatorInputTable. - */ -public class AnnotatorInputTableFeature implements Feature { - - private ArrayList columnNames; - private HashMap columnValues; //maps colum names to column values - - private String chr; - private int start; - private int end; - private String strRep = null; - - /** - * Constructor. - * @param chr The chromosome name. - * @param start The start position - * @param end The end position - */ - public AnnotatorInputTableFeature(String chr, int start, int end) { - this.chr = chr; - this.start = start; - this.end = end; - } - - - /** - * Constructor. - * @param columnNames The column names as parsed out of the file header. - */ - public AnnotatorInputTableFeature(ArrayList columnNames) { - this.columnNames = columnNames; - this.columnValues = new HashMap(); - } - - - - /** - * @return the list of column names from the file header. - */ - public ArrayList getHeader() { - return columnNames; - } - - - /** - * Returns the value of the given column. - * - * @param columnName The column name as it appears in the file header. 
- * @return The value - */ - public String getColumnValue(final String columnName) { - return columnValues.get(columnName); - } - - - public boolean containsColumnName(final String columnName) { - return columnValues.containsKey(columnName); - } - - - /** - * Sets the value for the given column. - * - * @param columnName The column name as it appears in the file header. - * @param value The value - * @return The existing value associated with the columnName, if there is one. - */ - protected String putColumnValue(final String columnName, final String value) { - return columnValues.put(columnName, value); - } - - /** - * @return all values in this line, hashed by their column names. - */ - public Map getColumnValues() { - return Collections.unmodifiableMap(columnValues); - } - - - public String getChr() { - return chr; - } - - public int getStart() { - return start; - } - - public int getEnd() { - return end; - } - - protected void setChr(String chr) { - this.chr = chr; - } - - protected void setStart(int start) { - this.start = start; - } - - protected void setEnd(int end) { - this.end = end; - } - - @Override - public String toString() { - if ( strRep == null ) { - StringBuilder sb = new StringBuilder(); - - for(String columnName : columnNames ) { - if ( sb.length() == 0 ) - sb.append("["); - else - sb.append(", "); - sb.append(columnName + "=" + columnValues.get(columnName)); - } - sb.append("]"); - - strRep = sb.toString(); - } - - return strRep; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/helpers/DbSNPHelper.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/helpers/DbSNPHelper.java index 3201769e0..35b0f73c6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/helpers/DbSNPHelper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/helpers/DbSNPHelper.java @@ -117,7 +117,11 @@ public class DbSNPHelper { } public static boolean isIndel(DbSNPFeature feature) { - return 
DbSNPHelper.isInsertion(feature) || DbSNPHelper.isDeletion(feature) || feature.getVariantType().contains("in-del"); + return DbSNPHelper.isInsertion(feature) || DbSNPHelper.isDeletion(feature) || DbSNPHelper.isComplexIndel(feature); + } + + public static boolean isComplexIndel(DbSNPFeature feature) { + return feature.getVariantType().contains("in-del"); } public static boolean isHapmap(DbSNPFeature feature) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java index f4c565318..dc3a617e7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java @@ -1,21 +1,23 @@ package org.broadinstitute.sting.gatk.report; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.text.TextFormattingUtils; import java.io.*; +import java.util.List; import java.util.TreeMap; /** * Container class for GATK report tables */ public class GATKReport { - private TreeMap tables; + private TreeMap tables = new TreeMap(); /** * Create a new, empty GATKReport. 
*/ public GATKReport() { - tables = new TreeMap(); } /** @@ -23,7 +25,7 @@ public class GATKReport { * @param filename the path to the file to load */ public GATKReport(String filename) { - loadReport(new File(filename)); + this(new File(filename)); } /** @@ -31,7 +33,6 @@ public class GATKReport { * @param file the file to load */ public GATKReport(File file) { - tables = new TreeMap(); loadReport(file); } @@ -46,11 +47,17 @@ public class GATKReport { GATKReportTable table = null; String[] header = null; int id = 0; + GATKReportVersion version = null; + List columnStarts = null; String line; while ( (line = reader.readLine()) != null ) { - if (line.startsWith("##:GATKReport.v0.1 ")) { - line = line.replaceFirst("##:GATKReport.v0.1 ", ""); + + if (line.startsWith("##:GATKReport.v")) { + + version = GATKReportVersion.fromHeader(line); + + line = line.replaceFirst("##:GATKReport." + version.versionString + " ", ""); String[] pieces = line.split(" : "); String tableName = pieces[0]; @@ -58,14 +65,35 @@ public class GATKReport { addTable(tableName, tableDesc); table = getTable(tableName); + table.setVersion(version); header = null; - } else if ( line.isEmpty() ) { + columnStarts = null; + } else if ( line.trim().isEmpty() ) { // do nothing } else { if (table != null) { + + String[] splitLine; + + switch (version) { + case V0_1: + splitLine = TextFormattingUtils.splitWhiteSpace(line); + break; + + case V0_2: + if (header == null) { + columnStarts = TextFormattingUtils.getWordStarts(line); + } + splitLine = TextFormattingUtils.splitFixedWidth(line, columnStarts); + break; + + default: + throw new ReviewedStingException("GATK report version parsing not implemented for: " + line); + } + if (header == null) { - header = line.split("\\s+"); + header = splitLine; table.addPrimaryKey("id", false); @@ -75,10 +103,8 @@ public class GATKReport { id = 0; } else { - String[] entries = line.split("\\s+"); - for (int columnIndex = 0; columnIndex < header.length; columnIndex++) { - 
table.set(id, header[columnIndex], entries[columnIndex]); + table.set(id, header[columnIndex], splitLine[columnIndex]); } id++; @@ -100,7 +126,11 @@ public class GATKReport { * @param tableDescription the description of the table */ public void addTable(String tableName, String tableDescription) { - GATKReportTable table = new GATKReportTable(tableName, tableDescription); + addTable(tableName, tableDescription, true); + } + + public void addTable(String tableName, String tableDescription, boolean sortByPrimaryKey) { + GATKReportTable table = new GATKReportTable(tableName, tableDescription, sortByPrimaryKey); tables.put(tableName, table); } @@ -121,7 +151,10 @@ public class GATKReport { * @return the table object */ public GATKReportTable getTable(String tableName) { - return tables.get(tableName); + GATKReportTable table = tables.get(tableName); + if (table == null) + throw new ReviewedStingException("Table is not in GATKReport: " + tableName); + return table; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java index 440597754..1c46b3bac 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumn.java @@ -37,10 +37,10 @@ public class GATKReportColumn extends TreeMap { * tables, as the table gets written properly without having to waste storage for the unset elements (usually the zero * values) in the table. 
* - * @param primaryKey the primary key position in the column that should be set + * @param primaryKey the primary key position in the column that should be retrieved * @return the value at the specified position in the column, or the default value if the element is not set */ - public Object getWithoutSideEffects(Object primaryKey) { + private Object getWithoutSideEffects(Object primaryKey) { if (!this.containsKey(primaryKey)) { return defaultValue; } @@ -48,6 +48,16 @@ public class GATKReportColumn extends TreeMap { return this.get(primaryKey); } + /** + * Return an object from the column, but if it doesn't exist, return the default value. + * + * @param primaryKey the primary key position in the column that should be retrieved + * @return the string value at the specified position in the column, or the default value if the element is not set + */ + public String getStringValue(Object primaryKey) { + return toString(getWithoutSideEffects(primaryKey)); + } + /** * Return the displayable property of the column. If true, the column will be displayed in the final output. * If not, printing will be suppressed for the contents of the table. @@ -67,7 +77,7 @@ public class GATKReportColumn extends TreeMap { for (Object obj : this.values()) { if (obj != null) { - int width = obj.toString().length(); + int width = toString(obj).length(); if (width > maxWidth) { maxWidth = width; @@ -77,4 +87,23 @@ public class GATKReportColumn extends TreeMap { return maxWidth; } + + /** + * Returns a string version of the values. 
+ * @param obj The object to convert to a string + * @return The string representation of the column + */ + private static String toString(Object obj) { + String value; + if (obj == null) { + value = "null"; + } else if (obj instanceof Float) { + value = String.format("%.8f", (Float) obj); + } else if (obj instanceof Double) { + value = String.format("%.8f", (Double) obj); + } else { + value = obj.toString(); + } + return value; + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportParserUnitTest.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java similarity index 50% rename from public/java/test/org/broadinstitute/sting/gatk/report/GATKReportParserUnitTest.java rename to public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java index cfd75c41a..a33631c85 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportParserUnitTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java @@ -24,26 +24,32 @@ package org.broadinstitute.sting.gatk.report; -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.Test; +import java.util.*; -import java.io.File; +/** + * Tracks a linked list of GATKReportColumn in order by name. 
+ */ +public class GATKReportColumns extends LinkedHashMap { + private List columnNames = new ArrayList(); -public class GATKReportParserUnitTest extends BaseTest { - @Test - public void testParse() throws Exception { - GATKReportParser parser = new GATKReportParser(); - parser.parse(new File(validationDataLocation + "exampleGATKReport.eval")); + /** + * Returns the column by index + * @param i the index + * @return The column + */ + public GATKReportColumn getByIndex(int i) { + return get(columnNames.get(i)); + } - Assert.assertEquals(parser.getValue("CountVariants", "none.eval.none.all", "nProcessedLoci"), "100000"); - Assert.assertEquals(parser.getValue("CountVariants", "none.eval.none.all", "nNoCalls"), "99872"); + @Override + public GATKReportColumn remove(Object key) { + columnNames.remove(key); + return super.remove(key); + } - Assert.assertEquals(parser.getValue("SimpleMetricsByAC.metrics", "none.eval.none.novel.ac2", "AC"), "2"); - Assert.assertNull(parser.getValue("SimpleMetricsByAC.metrics", "none.eval.none.novel.ac2.bad", "AC")); - Assert.assertNull(parser.getValue("SimpleMetricsByAC.metrics", "none.eval.none.novel.ac2", "AC.bad")); - Assert.assertNull(parser.getValue("SimpleMetricsByAC.metrics.bad", "none.eval.none.novel.ac2", "AC")); - - Assert.assertEquals(parser.getValue("ValidationReport", "none.eval.none.known", "sensitivity"), "NaN"); + @Override + public GATKReportColumn put(String key, GATKReportColumn value) { + columnNames.add(key); + return super.put(key, value); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportParser.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportParser.java deleted file mode 100644 index 6915d5cb2..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportParser.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software 
and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.report; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; - -public class GATKReportParser { - private List tables = new ArrayList(); - - public void parse(File file) throws IOException { - InputStream stream = FileUtils.openInputStream(file); - try { - parse(stream); - } finally { - IOUtils.closeQuietly(stream); - } - } - - public void parse(InputStream input) throws IOException { - GATKReportTableParser table = null; - - for (String line: new XReadLines(input)) { - if (line.startsWith("##:GATKReport.v0.1 ")) { - table = newTableParser(line); - tables.add(table); - table.parse(line); - } else if (table != null) { - if (line.trim().length() == 0) - table = null; - else - table.parse(line); - } - } - } - - public String 
getValue(String tableName, String[] key, String column) { - for (GATKReportTableParser table: tables) - if (table.getTableName().equals(tableName)) - return table.getValue(key, column); - return null; - } - - public String getValue(String tableName, String key, String column) { - for (GATKReportTableParser table: tables) - if (table.getTableName().equals(tableName)) - return table.getValue(key, column); - return null; - } - - private GATKReportTableParser newTableParser(String header) { - return new GATKReportTableParser(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index 0e503f92a..5d38295f5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -1,11 +1,10 @@ package org.broadinstitute.sting.gatk.report; +import org.apache.commons.lang.ObjectUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.PrintStream; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.TreeSet; +import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -90,16 +89,20 @@ import java.util.regex.Pattern; * but at least the prototype contained herein works. 
* * @author Kiran Garimella + * @author Khalid Shakir */ public class GATKReportTable { + private static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V0_2; private String tableName; private String tableDescription; + private GATKReportVersion version = LATEST_REPORT_VERSION; private String primaryKeyName; - private TreeSet primaryKeyColumn; + private Collection primaryKeyColumn; private boolean primaryKeyDisplay; + private boolean sortByPrimaryKey = true; - private LinkedHashMap columns; + private GATKReportColumns columns; /** * Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed @@ -114,6 +117,19 @@ public class GATKReportTable { return !m.find(); } + /** + * Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed + * + * @param description the name of the table or column + * @return true if the name is valid, false if otherwise + */ + private boolean isValidDescription(String description) { + Pattern p = Pattern.compile("\\r|\\n"); + Matcher m = p.matcher(description); + + return !m.find(); + } + /** * Construct a new GATK report table with the specified name and description * @@ -121,14 +137,31 @@ public class GATKReportTable { * @param tableDescription the description of the table */ public GATKReportTable(String tableName, String tableDescription) { - if (!isValidName(tableName)) { + this(tableName, tableDescription, true); + } + + public GATKReportTable(String tableName, String tableDescription, boolean sortByPrimaryKey) { + if (!isValidName(tableName)) { throw new ReviewedStingException("Attempted to set a GATKReportTable name of '" + tableName + "'. GATKReportTable names must be purely alphanumeric - no spaces or special characters are allowed."); } + if (!isValidDescription(tableDescription)) { + throw new ReviewedStingException("Attempted to set a GATKReportTable description of '" + tableDescription + "'. 
GATKReportTable descriptions must not contain newlines."); + } + this.tableName = tableName; this.tableDescription = tableDescription; + this.sortByPrimaryKey = sortByPrimaryKey; - columns = new LinkedHashMap(); + columns = new GATKReportColumns(); + } + + public GATKReportVersion getVersion() { + return version; + } + + protected void setVersion(GATKReportVersion version) { + this.version = version; } /** @@ -137,20 +170,14 @@ public class GATKReportTable { * @param primaryKeyName the name of the primary key column */ public void addPrimaryKey(String primaryKeyName) { - if (!isValidName(primaryKeyName)) { - throw new ReviewedStingException("Attempted to set a GATKReportTable primary key name of '" + primaryKeyName + "'. GATKReportTable primary key names must be purely alphanumeric - no spaces or special characters are allowed."); - } - - this.primaryKeyName = primaryKeyName; - - primaryKeyColumn = new TreeSet(); - primaryKeyDisplay = true; + addPrimaryKey(primaryKeyName, true); } /** * Add an optionally visible primary key column. This becomes the unique identifier for every column in the table, and will always be printed as the first column. * * @param primaryKeyName the name of the primary key column + * @param display should this primary key be displayed? */ public void addPrimaryKey(String primaryKeyName, boolean display) { if (!isValidName(primaryKeyName)) { @@ -159,10 +186,61 @@ public class GATKReportTable { this.primaryKeyName = primaryKeyName; - primaryKeyColumn = new TreeSet(); + primaryKeyColumn = sortByPrimaryKey ? new TreeSet() : new LinkedList(); primaryKeyDisplay = display; } + /** + * Returns the first primary key matching the dotted column values. + * Ex: dbsnp.eval.called.all.novel.all + * @param dottedColumnValues Period concatenated values. + * @return The first primary key matching the column values or throws an exception. 
+ */ + public Object getPrimaryKey(String dottedColumnValues) { + Object key = findPrimaryKey(dottedColumnValues); + if (key == null) + throw new ReviewedStingException("Attempted to get non-existent GATKReportTable key for values: " + dottedColumnValues); + return key; + } + + /** + * Returns true if there is at least on row with the dotted column values. + * Ex: dbsnp.eval.called.all.novel.all + * @param dottedColumnValues Period concatenated values. + * @return true if there is at least one row matching the columns. + */ + public boolean containsPrimaryKey(String dottedColumnValues) { + return findPrimaryKey(dottedColumnValues) != null; + } + + /** + * Returns the first primary key matching the dotted column values. + * Ex: dbsnp.eval.called.all.novel.all + * @param dottedColumnValues Period concatenated values. + * @return The first primary key matching the column values or null. + */ + private Object findPrimaryKey(String dottedColumnValues) { + return findPrimaryKey(dottedColumnValues.split("\\.")); + } + + /** + * Returns the first primary key matching the column values. + * Ex: new String[] { "dbsnp", "eval", "called", "all", "novel", "all" } + * @param columnValues column values. + * @return The first primary key matching the column values. + */ + private Object findPrimaryKey(Object[] columnValues) { + for (Object primaryKey : primaryKeyColumn) { + boolean matching = true; + for (int i = 0; matching && i < columnValues.length; i++) { + matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i+1)); + } + if (matching) + return primaryKey; + } + return null; + } + /** * Add a column to the report and specify the default value that should be supplied if a given position in the table is never explicitly set. 
* @@ -232,6 +310,17 @@ public class GATKReportTable { return columns.get(columnName).get(primaryKey); } + /** + * Get a value from the given position in the table + * + * @param primaryKey the primary key value + * @param columnIndex the index of the column + * @return the value stored at the specified position in the table + */ + private Object get(Object primaryKey, int columnIndex) { + return columns.getByIndex(columnIndex).get(primaryKey); + } + /** * Increment an element in the table. This implementation is awful - a functor would probably be better. * @@ -517,7 +606,7 @@ public class GATKReportTable { String primaryKeyFormat = "%-" + getPrimaryKeyColumnWidth() + "s"; // Emit the table definition - out.printf("##:GATKReport.v0.1 %s : %s%n", tableName, tableDescription); + out.printf("##:GATKReport.%s %s : %s%n", LATEST_REPORT_VERSION.versionString, tableName, tableDescription); // Emit the table header, taking into account the padding requirement if the primary key is a hidden column boolean needsPadding = false; @@ -547,22 +636,8 @@ public class GATKReportTable { for (String columnName : columns.keySet()) { if (columns.get(columnName).isDisplayable()) { - Object obj = columns.get(columnName).getWithoutSideEffects(primaryKey); - if (needsPadding) { out.printf(" "); } - - String value = "null"; - if (obj != null) { - if (obj instanceof Float) { - value = String.format("%.8f", (Float) obj); - } else if (obj instanceof Double) { - value = String.format("%.8f", (Double) obj); - } else { - value = obj.toString(); - } - } - - //out.printf(columnWidths.get(columnName), obj == null ? 
"null" : obj.toString()); + String value = columns.get(columnName).getStringValue(primaryKey); out.printf(columnWidths.get(columnName), value); needsPadding = true; diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTableParser.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTableParser.java deleted file mode 100644 index 6fd9f9627..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTableParser.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.report; - -import org.apache.commons.lang.StringUtils; - -import java.util.*; - -public class GATKReportTableParser { - private int lineNum = 0; - private String[] descriptions; - private Map headers = new HashMap(); - private List values = new ArrayList(); - - public void parse(String line) { - lineNum++; - switch (lineNum) { - case 1: - descriptions = parseLine(line); - case 2: - String[] columnHeaders = parseLine(line); - for (int i = 0; i < columnHeaders.length; i++) - headers.put(columnHeaders[i], i); - default: - values.add(parseLine(line)); - } - } - - public String getTableName() { - return descriptions[1]; - } - - public String getValue(String[] key, String column) { - if (!headers.containsKey(column)) - return null; - for (String[] row: values) - if (Arrays.equals(key, Arrays.copyOfRange(row, 1, key.length + 1))) - return row[headers.get(column)]; - return null; - } - - public String getValue(String key, String column) { - return getValue(key.split("\\."), column); - } - - private String generateKey(String[] row, int i) { - return StringUtils.join(row, ".", 0, i); - } - - private String[] parseLine(String line) { - return line.split(" +"); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java new file mode 100644 index 000000000..5f1159a43 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * 
Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.report; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +public enum GATKReportVersion { + /** + * Differences between other versions: + * - Does not allow spaces in cells. + * - Mostly fixed width but has a bug where the string width of floating point + * values was not measured correctly leading to columns that aren't aligned + */ + V0_1("v0.1"), + + /** + * Differences between other versions: + * - Spaces allowed in cells, for example in sample names with spaces in them ex: "C507/FG-CR 6". + * - Fixed width fixed for floating point values + */ + V0_2("v0.2"); + + public final String versionString; + + private GATKReportVersion(String versionString) { + this.versionString = versionString; + } + + @Override + public String toString() { + return versionString; + } + + /** + * Returns the GATK Report Version from the file header. + * @param header Header from the file starting with ##:GATKReport.v[version] + * @return The version as an enum. 
+ */ + public static GATKReportVersion fromHeader(String header) { + if (header.startsWith("##:GATKReport.v0.1 ")) + return GATKReportVersion.V0_1; + + if (header.startsWith("##:GATKReport.v0.2 ")) + return GATKReportVersion.V0_2; + + throw new ReviewedStingException("Unknown GATK report version in header: " + header); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java index a189c00b5..7e1dcd707 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java @@ -30,10 +30,16 @@ import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.baq.BAQ; +import java.io.File; +import java.util.Collection; +import java.util.Set; +import java.util.TreeSet; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; /** * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear * in the input file. It can dynamically merge the contents of multiple input BAM files, resulting @@ -52,6 +58,13 @@ public class PrintReadsWalker extends ReadWalker { String platform = null; // E.g. ILLUMINA, 454 @Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false) int nReadsToPrint = -1; + @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). 
Can be specified multiple times", required=false) + public Set sampleFile = new TreeSet(); + @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false) + public Set sampleNames = new TreeSet(); + + private TreeSet samplesToChoose = new TreeSet(); + private boolean SAMPLES_SPECIFIED = false; /** * The initialize function. @@ -59,6 +72,20 @@ public class PrintReadsWalker extends ReadWalker { public void initialize() { if ( platform != null ) platform = platform.toUpperCase(); + + Collection samplesFromFile; + if (!sampleFile.isEmpty()) { + samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFile); + samplesToChoose.addAll(samplesFromFile); + } + + if (!sampleNames.isEmpty()) + samplesToChoose.addAll(sampleNames); + + if(!samplesToChoose.isEmpty()) { + SAMPLES_SPECIFIED = true; + } + } /** @@ -85,6 +112,14 @@ public class PrintReadsWalker extends ReadWalker { if ( readPlatformAttr == null || !readPlatformAttr.toString().toUpperCase().contains(platform)) return false; } + if (SAMPLES_SPECIFIED ) { + // user specified samples to select + // todo - should be case-agnostic but for simplicity and speed this is ignored. + // todo - can check at initialization intersection of requested samples and samples in BAM header to further speedup. 
+ if (!samplesToChoose.contains(read.getReadGroup().getSample())) + return false; + } + // check if we've reached the output limit if ( nReadsToPrint == 0 ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadFilters.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadFilters.java index ff3b6d82f..5f11686a1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadFilters.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadFilters.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers; import net.sf.picard.filter.SamRecordFilter; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import java.lang.annotation.*; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index 384742302..9e261a0b1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -26,11 +26,14 @@ package org.broadinstitute.sting.gatk.walkers; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.filters.MalformedReadFilter; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.GenericDocumentationHandler; import java.util.List; @@ -44,6 +47,10 @@ import java.util.List; @ReadFilters(MalformedReadFilter.class) @PartitionBy(PartitionType.NONE) @BAQMode(QualityMode = BAQ.QualityMode.OVERWRITE_QUALS, ApplicationTime = BAQ.ApplicationTime.ON_INPUT) +@DocumentedGATKFeature( + groupName = "GATK walkers", + summary = "General tools available for running on the command line as part of the GATK 
package", + extraDocs = {CommandLineGATK.class}) public abstract class Walker { final protected static Logger logger = Logger.getLogger(Walker.class); private GenomeAnalysisEngine toolkit; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java index 3144098a8..784927ab4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java @@ -42,7 +42,7 @@ import java.util.List; import java.util.Map; -public class AlleleBalance implements InfoFieldAnnotation { +public class AlleleBalance extends InfoFieldAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java index a99f87a70..f70a87dc5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java @@ -15,7 +15,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.*; -public class AlleleBalanceBySample implements GenotypeAnnotation, ExperimentalAnnotation { +public class AlleleBalanceBySample extends GenotypeAnnotation implements ExperimentalAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g) { Double ratio = annotateSNP(stratifiedContext, vc, g); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java index 6c14e7445..dc41dbc81 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java @@ -8,7 +8,7 @@ import java.util.Map; -public abstract class AnnotationByDepth implements InfoFieldAnnotation { +public abstract class AnnotationByDepth extends InfoFieldAnnotation { protected int annotationByVariantDepth(final Map genotypes, Map stratifiedContexts) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java index 66416ce11..7cd159c5d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java @@ -46,7 +46,7 @@ import java.util.List; import java.util.Map; -public class BaseCounts implements InfoFieldAnnotation { +public class BaseCounts extends InfoFieldAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java index 74f7f9d80..9b30079d0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java @@ -43,7 +43,7 @@ import java.util.List; import java.util.Map; -public class ChromosomeCounts implements InfoFieldAnnotation, StandardAnnotation { +public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation { private String[] keyNames = { VCFConstants.ALLELE_NUMBER_KEY, 
VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY }; private VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed"), diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index c384e0d09..d8907c57f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -16,7 +16,7 @@ import java.util.List; import java.util.Map; -public class DepthOfCoverage implements InfoFieldAnnotation, StandardAnnotation { +public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index e3e8bc258..20513421d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -22,7 +22,7 @@ import java.util.List; import java.util.Map; -public class DepthPerAlleleBySample implements GenotypeAnnotation, StandardAnnotation { +public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation { private static String REF_ALLELE = "REF"; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 
97ed221e7..e71febece 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -42,7 +42,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.*; -public class FisherStrand implements InfoFieldAnnotation, StandardAnnotation { +public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation { private static final String FS = "FS"; private static final double MIN_PVALUE = 1E-320; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java index 48677bbe5..588d3e98a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java @@ -16,7 +16,7 @@ import java.util.List; import java.util.Map; -public class GCContent implements InfoFieldAnnotation, ExperimentalAnnotation { +public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { double content = computeGCContent(ref); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java index cca0ad4bc..862e12f7d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GLstats.java @@ -23,7 +23,7 @@ import java.util.Map; */ // A set of annotations calculated directly from the GLs -public class GLstats implements InfoFieldAnnotation, StandardAnnotation { +public class GLstats extends InfoFieldAnnotation implements StandardAnnotation { private static final int MIN_SAMPLES = 10; 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index b175579f1..2196de389 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -48,7 +48,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.*; -public class HaplotypeScore implements InfoFieldAnnotation, StandardAnnotation { +public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation { private final static boolean DEBUG = false; private final static int MIN_CONTEXT_WING_SIZE = 10; private final static int MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER = 50; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java index d86728d5e..2d9424e98 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java @@ -18,7 +18,7 @@ import java.util.List; import java.util.Map; -public class HardyWeinberg implements InfoFieldAnnotation, WorkInProgressAnnotation { +public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgressAnnotation { private static final int MIN_SAMPLES = 10; private static final int MIN_GENOTYPE_QUALITY = 10; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java index 02efd854c..870e9992b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java @@ -16,7 +16,7 @@ import 
java.util.List; import java.util.Map; -public class HomopolymerRun implements InfoFieldAnnotation, StandardAnnotation { +public class HomopolymerRun extends InfoFieldAnnotation implements StandardAnnotation { private boolean ANNOTATE_INDELS = true; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java index 12b48473d..b1c16ba0d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java @@ -19,16 +19,32 @@ import java.util.*; * Time: 11:47:33 AM * To change this template use File | Settings | File Templates. */ -public class IndelType implements InfoFieldAnnotation, ExperimentalAnnotation { +public class IndelType extends InfoFieldAnnotation implements ExperimentalAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { int run; - if ( vc.isIndel() && vc.isBiallelic() ) { + if (vc.isMixed()) { + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%s", "MIXED")); + return map; + + } + else if ( vc.isIndel() ) { String type=""; - ArrayList inds = IndelUtils.findEventClassificationIndex(vc, ref); - for (int k : inds) { - type = type+ IndelUtils.getIndelClassificationName(k)+"."; + if (!vc.isBiallelic()) + type = "MULTIALLELIC_INDEL"; + else { + if (vc.isInsertion()) + type = "INS."; + else if (vc.isDeletion()) + type = "DEL."; + else + type = "OTHER."; + ArrayList inds = IndelUtils.findEventClassificationIndex(vc, ref); + for (int k : inds) { + type = type+ IndelUtils.getIndelClassificationName(k)+"."; + } } Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%s", type)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java index 1d999c531..5de9aaa3b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java @@ -16,7 +16,7 @@ import java.util.List; import java.util.Map; -public class LowMQ implements InfoFieldAnnotation { +public class LowMQ extends InfoFieldAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java index f240d02bc..60bfe945f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java @@ -18,7 +18,7 @@ import java.util.List; import java.util.Map; -public class MappingQualityZero implements InfoFieldAnnotation, StandardAnnotation { +public class MappingQualityZero extends InfoFieldAnnotation implements StandardAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java index 0ca53adf2..f2b7b72b9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java @@ -49,7 +49,7 @@ import java.util.Map; * Time: 6:46:25 PM * To change this template use File | Settings | File Templates. 
*/ -public class MappingQualityZeroBySample implements GenotypeAnnotation { +public class MappingQualityZeroBySample extends GenotypeAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantContext vc, Genotype g) { if ( g == null || !g.isCalled() ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java index 08a25a7e3..3a6c9dce9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java @@ -18,7 +18,7 @@ import java.util.Map; -public class MappingQualityZeroFraction implements InfoFieldAnnotation, ExperimentalAnnotation { +public class MappingQualityZeroFraction extends InfoFieldAnnotation implements ExperimentalAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java index 1c70a1b33..9f67acf65 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java @@ -21,7 +21,7 @@ import java.util.Map; * Date: 5/16/11 */ -public class NBaseCount implements InfoFieldAnnotation { +public class NBaseCount extends InfoFieldAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index 2175d39e6..20bee9008 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -16,7 +16,7 @@ import java.util.List; import java.util.Map; -public class QualByDepth extends AnnotationByDepth implements InfoFieldAnnotation, StandardAnnotation { +public class QualByDepth extends AnnotationByDepth implements StandardAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java index d52f07b58..d1d9871e7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -20,7 +20,7 @@ import java.util.List; import java.util.Map; -public class RMSMappingQuality implements InfoFieldAnnotation, StandardAnnotation { +public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 5466828f6..643056c1d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -21,7 +21,7 @@ import java.util.Map; -public abstract class RankSumTest implements InfoFieldAnnotation, 
StandardAnnotation { +public abstract class RankSumTest extends InfoFieldAnnotation implements StandardAnnotation { static final double INDEL_LIKELIHOOD_THRESH = 0.1; static final boolean DEBUG = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java index c56e2622d..f3e99235a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java @@ -52,7 +52,7 @@ import java.util.Map; * Time: 3:59:27 PM * To change this template use File | Settings | File Templates. */ -public class ReadDepthAndAllelicFractionBySample implements GenotypeAnnotation { +public class ReadDepthAndAllelicFractionBySample extends GenotypeAnnotation { private static String REF_ALLELE = "REF"; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java index ff9092a71..3712ca8ae 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java @@ -41,7 +41,7 @@ import java.util.List; import java.util.Map; -public class SampleList implements InfoFieldAnnotation { +public class SampleList extends InfoFieldAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( vc.isMonomorphic() || !vc.hasGenotypes() ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java index a4668eeb6..332b0226b 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -16,7 +16,7 @@ import java.util.List; import java.util.Map; -public class SpanningDeletions implements InfoFieldAnnotation, StandardAnnotation { +public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java index b46d82d8b..626142cd2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java @@ -24,7 +24,7 @@ import java.util.Map; * Time: 3:14 PM * To change this template use File | Settings | File Templates. 
*/ -public class TechnologyComposition implements ExperimentalAnnotation,InfoFieldAnnotation { +public class TechnologyComposition extends InfoFieldAnnotation implements ExperimentalAnnotation { private String nSLX = "NumSLX"; private String n454 ="Num454"; private String nSolid = "NumSOLiD"; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index acbeee3b2..d39912ed2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -219,18 +219,18 @@ public class VariantAnnotator extends RodWalker { if ( stratifiedContexts != null ) { annotatedVCs = new ArrayList(VCs.size()); for ( VariantContext vc : VCs ) - annotatedVCs.addAll(engine.annotateContext(tracker, ref, stratifiedContexts, vc)); + annotatedVCs.add(engine.annotateContext(tracker, ref, stratifiedContexts, vc)); } } if ( ! indelsOnly ) { for ( VariantContext annotatedVC : annotatedVCs ) - vcfWriter.add(annotatedVC, ref.getBase()); + vcfWriter.add(annotatedVC); } else { // check to see if the buffered context is different (in location) this context if ( indelBufferContext != null && ! 
VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),indelBufferContext.iterator().next()).equals(VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),annotatedVCs.iterator().next())) ) { for ( VariantContext annotatedVC : indelBufferContext ) - vcfWriter.add(annotatedVC, ref.getBase()); + vcfWriter.add(annotatedVC); indelBufferContext = annotatedVCs; } else { indelBufferContext = annotatedVCs; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index fdf498a3d..0d1b21499 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -31,8 +31,6 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; -import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.GenomicAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.JoinTable; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; @@ -45,7 +43,6 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.*; -import java.util.Map.Entry; public class VariantAnnotatorEngine { @@ -58,19 +55,6 @@ public class VariantAnnotatorEngine { private HashMap dbAnnotations = new HashMap(); - // command-line option from GenomicAnnotator. 
- private Map> requestedColumnsMap; - - // command-line option from GenomicAnnotator. - private boolean oneToMany; - - // command-line option from GenomicAnnotator. - private List joinTables; - - // used by GenomicAnnotator. Maps binding name to number of output VCF records - // annotated with records from the input table with this binding name. Only used for - // printing out stats at the end. - private Map inputTableHitCounter = new HashMap(); private static class VAExpression { public String fullName, bindingName, fieldName; @@ -140,7 +124,7 @@ public class VariantAnnotatorEngine { return descriptions; } - public Collection annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public VariantContext annotateContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); @@ -150,42 +134,18 @@ public class VariantAnnotatorEngine { // annotate expressions where available annotateExpressions(tracker, ref, infoAnnotations); - // process the info field - List> infoAnnotationOutputsList = new LinkedList>(); //each element in infoAnnotationOutputs corresponds to a single line in the output VCF file - infoAnnotationOutputsList.add(new LinkedHashMap(vc.getAttributes())); //keep the existing info-field annotations. After this infoAnnotationOutputsList.size() == 1, which means the output VCF file has 1 additional line. 
- infoAnnotationOutputsList.get(0).putAll(infoAnnotations); // put the DB membership info in - // go through all the requested info annotationTypes - for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) - { + for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { Map annotationsFromCurrentType = annotationType.annotate(tracker, ref, stratifiedContexts, vc); - if ( annotationsFromCurrentType == null ) { - continue; - } - - if(annotationType instanceof GenomicAnnotation) - { - infoAnnotationOutputsList = processGenomicAnnotation( infoAnnotationOutputsList, annotationsFromCurrentType ); - } - else - { - // add the annotations to each output line. - for(Map infoAnnotationOutput : infoAnnotationOutputsList) { - infoAnnotationOutput.putAll(annotationsFromCurrentType); - } - } + if ( annotationsFromCurrentType != null ) + infoAnnotations.putAll(annotationsFromCurrentType); } - // annotate genotypes - Map genotypes = annotateGenotypes(tracker, ref, stratifiedContexts, vc); + // generate a new annotated VC + final VariantContext annotatedVC = VariantContext.modifyAttributes(vc, infoAnnotations); - // create a separate VariantContext (aka. output line) for each element in infoAnnotationOutputsList - Collection returnValue = new LinkedList(); - for(Map infoAnnotationOutput : infoAnnotationOutputsList) { - returnValue.add( new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? 
vc.getFilters() : null, infoAnnotationOutput) ); - } - - return returnValue; + // annotate genotypes, creating another new VC in the process + return VariantContext.modifyGenotypes(annotatedVC, annotateGenotypes(tracker, ref, stratifiedContexts, vc)); } private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { @@ -251,6 +211,9 @@ public class VariantAnnotatorEngine { return genotypes; } + +/* + // Finish processing data from GenomicAnnotation. private List> processGenomicAnnotation( List> infoAnnotationOutputsList, Map annotationsForCurrentLocusFromAllAnnotatorInputTables) { @@ -403,12 +366,14 @@ public class VariantAnnotatorEngine { incrementStatsCounter(bindingName, infoAnnotationOutputsList.size()); } - /** + */ +/** * Records statistics that will be printed when GenomicAnnotator finishes. * * @param bindingName The table from which annotations were gotten * @param numNewRecords The number of new output VCF records created with annotations from this table - */ + *//* + private void incrementStatsCounter( final String bindingName, int numNewRecords) { //record some stats - there were infoAnnotationOutputsList.size() output VCF records annotated with data from the 'bindingName' input table. Integer counter = inputTableHitCounter.get(bindingName); @@ -453,13 +418,15 @@ public class VariantAnnotatorEngine { } - /** + */ +/** * Records statistics for the explodeInfoAnnotationOutputsList(..) calculation. 
* @param bindingName The table from which annotations were gotten * @param numNewVCFRecordsAnnotatedWithBindingNameData The number of new output VCF records created with annotations from this table * @param infoAnnotationOutputsList output list * @param matchingRecordsSize matching records size - */ + *//* + private void recordStats( final String bindingName, int numNewVCFRecordsAnnotatedWithBindingNameData, final List> infoAnnotationOutputsList, int matchingRecordsSize ) { //update stats for the 'bindingName' table @@ -509,13 +476,14 @@ public class VariantAnnotatorEngine { } - /** + */ +/** * Determines whether to exclude the given column from the annotations. * @param key The fully qualified columnName * @return Whether the -S arg specifies that this column should be included in the annotations. * - * TODO this function can be optimized through memoization - */ + *//* + private boolean isKeyFilteredOutBySelectArg(String key) { for(final String bindingName : requestedColumnsMap.keySet()) { @@ -536,10 +504,8 @@ public class VariantAnnotatorEngine { return false; //the -S arg doesn't have anything with the same binding name as this key, so the user implicitly requested this key } - - - - /** + */ +/** * Determines how the engine will handle the case where multiple records in a ROD file * overlap a particular single locus. If oneToMany is set to true, the output will be * one-to-many, so that each locus in the input VCF file could result in multiple @@ -551,18 +517,21 @@ public class VariantAnnotatorEngine { * See class-level comments for more details. * * @param oneToMany true if we should break out from one to many - */ + *//* + public void setOneToMany(boolean oneToMany) { this.oneToMany = oneToMany; } - /** + */ +/** * Sets the columns that will be used for the info annotation field. * Column names should be of the form bindingName.columnName (eg. dbsnp.avHet). 
* * @param columns An array of strings where each string is a comma-separated list * of columnNames (eg ["dbsnp.avHet,dbsnp.valid", "file2.col1,file3.col1"] ). - */ + *//* + public void setRequestedColumns(String[] columns) { if(columns == null) { throw new IllegalArgumentException("columns arg is null. Please check the -s command-line arg."); @@ -574,17 +543,20 @@ public class VariantAnnotatorEngine { } - /** + */ +/** * Passes in a pointer to the JoinTables. * * @param joinTables The list of JoinTables. There should be one JoinTable object for each -J arg. - */ + *//* + public void setJoinTables(List joinTables) { this.joinTables = joinTables; } - /** + */ +/** * Parses the columns arg and returns a Map of columns hashed by their binding name. * For example: * The command line: @@ -604,7 +576,8 @@ public class VariantAnnotatorEngine { * @param columnsArg The -s command line arg value. * * @return Map representing a parsed version of this arg - see above. - */ + *//* + private static Map> parseColumnsArg(String[] columnsArg) { Map> result = new HashMap>(); @@ -635,5 +608,6 @@ public class VariantAnnotatorEngine { return Collections.unmodifiableMap(inputTableHitCounter); } +*/ } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotation.java deleted file mode 100644 index 05c1b3c52..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotation.java +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * 
copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; -import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.*; -import java.util.Map.Entry; - -/** - * This plugin for {@link VariantAnnotatorEngine} serves as the core - * of the {@link GenomicAnnotator}. 
It finds all records in the -B input files - * that match the given variant's position and, optionally, the variant's reference and alternate alleles. - * - * For details, see: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator - */ -public class GenomicAnnotation implements InfoFieldAnnotation { - - public static final String CHR_COLUMN = "chr"; - public static final String START_COLUMN = "start"; - public static final String END_COLUMN = "end"; - public static final String HAPLOTYPE_REFERENCE_COLUMN = "haplotypeReference"; - public static final String HAPLOTYPE_ALTERNATE_COLUMN = "haplotypeAlternate"; - - public static final String NUM_MATCHES_SPECIAL_INFO_FIELD = "numMatchingRecords"; - - /** Characters that aren't allowed within VCF info field key-value pairs */ - public static final char[] ILLEGAL_INFO_FIELD_VALUES = { ' ', '=', ';' }; - /** Replacement for each character in ILLEGAL_INFO_FIELD_VALUES */ - public static final char[] ILLEGAL_INFO_FIELD_VALUE_SUBSTITUTES = { '_', '-', '!' }; - - - private void modifyAnnotationsForIndels(VariantContext vc, String featureName, Map annotationsForRecord) { - String inCodingRegionKey = featureName + ".inCodingRegion"; - String referenceCodonKey = featureName + ".referenceCodon"; - String variantCodonKey = featureName + ".variantCodon"; - String codingCoordStrKey = featureName + ".codingCoordStr"; - String proteinCoordStrKey = featureName + ".proteinCoordStr"; - String haplotypeReferenceKey = featureName + "." + HAPLOTYPE_REFERENCE_COLUMN; - String haplotypeAlternateKey = featureName + "." + HAPLOTYPE_ALTERNATE_COLUMN; - String functionalClassKey = featureName + ".functionalClass"; - String startKey = featureName + "." + START_COLUMN; - String endKey = featureName + "." 
+ END_COLUMN; - String referenceAAKey = featureName + ".referenceAA"; - String variantAAKey = featureName + ".variantAA"; - String changesAAKey = featureName + ".changesAA"; - - annotationsForRecord.put(variantCodonKey, "unknown"); - annotationsForRecord.put(codingCoordStrKey, "unknown"); - annotationsForRecord.put(proteinCoordStrKey, "unknown"); - annotationsForRecord.put(referenceAAKey, "unknown"); - annotationsForRecord.put(variantAAKey, "unknown"); - - String refAllele = vc.getReference().getDisplayString(); - if (refAllele.length() == 0) { refAllele = "-"; } - - String altAllele = vc.getAlternateAllele(0).toString(); - if (altAllele.length() == 0) { altAllele = "-"; } - - annotationsForRecord.put(haplotypeReferenceKey, refAllele); - annotationsForRecord.put(haplotypeAlternateKey, altAllele); - annotationsForRecord.put(startKey, String.format("%d", vc.getStart())); - annotationsForRecord.put(endKey, String.format("%d", vc.getEnd())); - - boolean isCodingRegion = annotationsForRecord.containsKey(inCodingRegionKey) && annotationsForRecord.get(inCodingRegionKey).equalsIgnoreCase("true") ? true : false; - boolean isFrameshift = (vc.getIndelLengths().get(0) % 3 == 0) ? false : true; - - String functionalClass; - if (isCodingRegion) { - functionalClass = isFrameshift ? "frameshift" : "inframe"; - annotationsForRecord.put(changesAAKey, "true"); - } else { - functionalClass = "noncoding"; - } - - annotationsForRecord.put(functionalClassKey, functionalClass); - } - - /** - * For each -B input file, for each record which overlaps the current locus, generates a - * set of annotations of the form: - * - * bindingName.columnName1=columnValue, bindingName.columnName2=columnValue2, etc. - * - * For example: dbSNP.avHet=0.7, dbSNP.ref_allele=A, etc. 
- * - * @return The following is an explanation of this method's return value: - * - * The annotations from a matching in a particular file are stored in a Map - * where the key is bindingName.columnName and the value is the columnValue. - * Since a single input file can have multiple records that overlap the current - * locus (eg. dbSNP can have multiple entries for the same genomic position), a different - * Map is created for each matching record in a particular file. - * The set of matching records for each file is then represented as a List> - * - * The return value of this method is a Map of the form: - * rodName1 -> List> - * rodName2 -> List> - * rodName3 -> List> - * ... - * Where the rodNames are the -B binding names for each file that were specified on the command line (eg. -B bindingName,AnnotatorInputTable,/path/to/file). - * - * NOTE: The lists (List>) are guaranteed to have size > 0 - * because a rodName -> List> entry will only - * be created in Map if the List has at least one element. - */ - public Map annotate(final RefMetaDataTracker tracker, - final ReferenceContext ref, - final Map stratifiedContexts, - final VariantContext vc) { - - //iterate over each record that overlaps the current locus, and, if it passes certain filters, - //add its values to the list of annotations for this locus. - final Map annotations = new HashMap(); - for(final GATKFeature gatkFeature : tracker.getAllRods()) - { - final String name = gatkFeature.getName(); - if( name.equals("variant") || name.equals("interval") ) { - continue; - } - - if( ! (gatkFeature.getUnderlyingObject() instanceof AnnotatorInputTableFeature) ) { - continue; //GenericAnnotation only works with TabularRODs because it needs to be able to select individual columns. 
- } - - final Map annotationsForRecord = convertRecordToAnnotations( gatkFeature.getName(), ((AnnotatorInputTableFeature) gatkFeature.getUnderlyingObject()).getColumnValues()); - - //If this record contains the HAPLOTYPE_REFERENCE_COLUMN and/or HAPLOTYPE_ALTERNATE_COLUMN, check whether the - //alleles specified match the the variant's reference allele and alternate allele. - //If they don't match, this record will be skipped, and its values will not be used for annotations. - // - //If one of these columns doesn't exist in the current rod, or if its value is * (star), then this is treated as an automatic match. - //Otherwise, the HAPLOTYPE_REFERENCE_COLUMN is only considered to be matching the variant's reference if the string values of the two - //are exactly equal (case-insensitive). - - //The HAPLOTYPE_REFERENCE_COLUMN matches the variant's reference allele based on a case-insensitive string comparison. - //The HAPLOTYPE_ALTERNATE_COLUMN can optionally list more than allele separated by one of these chars: ,\/:| - // only check this value for SNPs - String hapAltValue = vc.isSNP() ? annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_ALTERNATE_COLUMN) ) : null; - if ( hapAltValue != null && !hapAltValue.equals("*") ) { - Set alternateAlleles = vc.getAlternateAlleles(); - //if(alternateAlleles.isEmpty()) { - //handle a site that has been called monomorphic reference - //alternateAlleles.add(vc.getReference()); - //continue; //TODO If this site is monomorphic in the VC, and the current record specifies a particular alternate allele, skip this record. Right? - //} else - if(alternateAlleles.size() > 1) { - throw new UserException.MalformedFile("File associated with " + vc.getSource() + " contains record [" + vc + "] contains " + alternateAlleles.size() + " alternate alleles. 
GenomicAnnotion currently only supports annotating 1 alternate allele."); - } - - Allele vcAlt; - if(alternateAlleles.isEmpty()) { - vcAlt = vc.getReference(); - } else { - vcAlt = alternateAlleles.iterator().next(); - } - - boolean matchFound = false; - for(String hapAlt : hapAltValue.split("[,\\\\/:|]")) { - - if(!hapAlt.isEmpty() && vcAlt.basesMatch(hapAlt)) { - matchFound = true; - break; - } - } - if(!matchFound) { - continue; //skip record - none of its alternate alleles match the variant's alternate allele - } - } - - // only check this value for SNPs - String hapRefValue = vc.isSNP() ? annotationsForRecord.get( generateInfoFieldKey(name, HAPLOTYPE_REFERENCE_COLUMN) ) : null; - if(hapRefValue != null) - { - hapRefValue = hapRefValue.trim(); - if(!hapRefValue.equals("*")) - { - //match against hapolotypeReference. - Allele vcRef = vc.getReference(); - if(!vcRef.basesMatch(hapRefValue)) { - continue; //skip record - } - } - } - - if (vc.isIndel()) { - modifyAnnotationsForIndels(vc, name, annotationsForRecord); - } - - //filters passed, so add this record. - List> listOfMatchingRecords = (List>) annotations.get( name ); - if(listOfMatchingRecords == null) { - listOfMatchingRecords = new LinkedList>(); - listOfMatchingRecords.add( annotationsForRecord ); - annotations.put(name, listOfMatchingRecords); - } else { - listOfMatchingRecords.add( annotationsForRecord ); - } - } - - return annotations; - } - - - - - /** - * Converts the given record to a set of key-value pairs of the form: - * bindingName.columnName1=column1Value, bindingName.columnName2=column2Value - * (eg. dbSNP.avHet=0.7, dbSNP.ref_allele=A) - * - * @param record AnnotatorInputTableFeature corresponding to one record in one -B input file. - * @param bindingName The binding name of the given AnnotatorInputTableFeature. - * @return The map of columnName -> columnValue pairs. 
- */ - public static Map convertRecordToAnnotations( String bindingName, Map record) { - final Map result = new HashMap(); - - for(final Entry entry : record.entrySet()) { - final String value = entry.getValue(); - if(!value.trim().isEmpty()) { - result.put( generateInfoFieldKey(bindingName, entry.getKey()), scrubInfoFieldValue(entry.getValue())); - } - } - - return result; - } - - /** - * Combines the 2 values into a full key. - * @param rodBindingName -B name - * @param columnName column name - * @return info field key - */ - public static String generateInfoFieldKey(String rodBindingName, String columnName ) { - return rodBindingName + '.' + columnName; - } - - - - /** - * Replaces any characters that are not allowed in the info field of a VCF file. - * - * @param value info field value - * @return the value with any illegal characters replaced by legal ones. - */ - private static String scrubInfoFieldValue(String value) { - for(int i = 0; i < GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUES.length; i++) { - value = value.replace(GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUES[i], GenomicAnnotation.ILLEGAL_INFO_FIELD_VALUE_SUBSTITUTES[i]); - } - - return value; - } - - - - public List getDescriptions() { - return Arrays.asList(new VCFInfoHeaderLine("GenericAnnotation", 1, VCFHeaderLineType.Integer, "For each variant in the 'variants' ROD, finds all entries in the other -B files that overlap the variant's position.")); - } - - public List getKeyNames() { - return Arrays.asList("GenericAnnotation"); - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotator.java deleted file mode 100644 index b42310780..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotator.java +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * 
Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - - -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableCodec; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.io.File; -import java.io.IOException; -import java.util.*; -import java.util.Map.Entry; - -/** - * Annotates variant calls with information from user-specified tabular files. 
- * - * For details, see: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator - */ -@Requires(value={DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariantContext.class)) -@By(DataSource.REFERENCE) -public class GenomicAnnotator extends RodWalker implements TreeReducible { - - @Output(doc="File to which variants should be written",required=true) - protected VCFWriter vcfWriter = null; - - @Argument(fullName="vcfOutput", shortName="vcf", doc="Please use --out instead", required=false) - @Deprecated - protected String oldOutArg; - - @Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false) - protected String sampleName = null; - - @Argument(fullName="select", shortName="s", doc="Optionally specifies which subset of columns from which -B inputs should be used for annotations. For example, -B:mydbsnp,AnnotatorInputTable /path/to/mydbsnp.txt -B:mytable,AnnotatorInputTable /path/mytable.txt -s mydbsnp.avHet,mydbsnp.name,mytable.column3 will cause annotations to only be generated from the 3 columns specified using -s.", required=false) - protected String[] SELECT_COLUMNS = {}; - - @Argument(fullName="join", shortName="J", doc="Optionally specifies a file and column within that file that should be LEFT-JOIN'ed to a column in a previously-specified file. The file provided to -J must be tab-delimited, with the first non-comment/non-empty line containing column names. (example: -B:name,AnnotatorInputTable /path/to/file1 -J name2,/path/to/file2,name.columnName=name2.columnName2 - this will join the table in file2 to the table in file1) ", required=false) - protected String[] JOIN_ARGS = {}; - - @Argument(fullName="oneToMany", shortName="m", doc="If more than one record from the same file matches a particular locus (for example, multiple dbSNP records with the same position), create multiple entries in the ouptut VCF file - one for each match. 
If a particular tabular file has J matches, and another tabular file has K matches for a given locus, then J*K output VCF records will be generated - one for each pair of K, J. If this flag is not provided, the multiple records are still generated, but they are stored in the INFO field of a single output VCF record, with their annotation keys differentiated by appending '_i' with i varying from 1 to K*J. ", required=false) - protected Boolean ONE_TO_MANY = false; - - @Argument(fullName="maxJoinTableSize", shortName="maxJoin", doc="The maximum allowed size (i.e. number of rows) for a table provided with the -J argument", required=false) - protected Integer MAX_JOIN_TABLE_SIZE = 500000; - - @Argument(fullName="ignoreFilteredSites", shortName="noFilt", doc="If specified, don't annotate sites marked as filtered out") - protected Boolean IGNORE_FILTERED_SITES = false; - - private VariantAnnotatorEngine engine; - - /** - * Prepare the output file and the list of available features. - */ - public void initialize() { - - //read all ROD file headers and construct a set of all column names to be used for validation of command-line args - final Set allFullyQualifiedColumnNames = new LinkedHashSet(); - final Set allBindingNames = new LinkedHashSet(); - for(ReferenceOrderedDataSource ds : getToolkit().getRodDataSources()) { - if(! ds.getType().equals(AnnotatorInputTableCodec.class)) { - continue; //skip all non-AnnotatorInputTable files. - } - final String bindingName = ds.getName(); - File file = ds.getFile(); - allBindingNames.add(bindingName); - try { - final ArrayList header = AnnotatorInputTableCodec.readHeader(file); - for(String columnName : header) { - allFullyQualifiedColumnNames.add(bindingName + "." + columnName); - } - } catch(IOException e) { - throw new UserException.CouldNotReadInputFile(file, "Failed when attempting to read file header. ", e); - } - } - - //parse the JOIN_COLUMNS args, read in the specified files, and validate column names in the = relation. 
This end result of this loop is to populate the List of joinTables with one entry per -J arg. - final List joinTables = new LinkedList(); - for(String joinArg : JOIN_ARGS) { - - //parse the tokens - final String[] arg = joinArg.split(","); - if(arg.length != 3) { - throw new UserException.BadArgumentValue("-J", "The following -J arg: \"" + joinArg + "\" must contain 3 comma-separated values. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); - } - final String bindingName = arg[0]; - final String filename = arg[1]; - final String columnsToJoin = arg[2]; - - if(allBindingNames.contains(bindingName)) { - throw new UserException.BadArgumentValue("-J", "The name \"" + bindingName + "\" in the -J arg: \"" + joinArg + "\" has already been used in another binding."); - } - - String[] splitOnEquals = columnsToJoin.split("=+"); - if(splitOnEquals.length != 2) { - throw new UserException.BadArgumentValue("-J", "The -J arg: \"" + joinArg + "\" must specify the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); - } - - String[] splitOnDot1 = splitOnEquals[0].split("\\."); - String[] splitOnDot2 = splitOnEquals[1].split("\\."); - if(splitOnDot1.length != 2 || splitOnDot2.length != 2) { - throw new UserException.BadArgumentValue("-J", "The -J arg: \"" + joinArg + "\" must fully specify the columns to join on. 
(ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); - } - - final String bindingName1 = splitOnDot1[0]; - final String columnName1 = splitOnDot1[1]; - final String bindingName2 = splitOnDot2[0]; - final String columnName2 = splitOnDot2[1]; - - //figure out which of the 2 binding names within the = relation matches the -J bindingName - final String localBindingName = bindingName; //alias - final String localColumnName; - final String externalBindingName; - final String externalColumnName; - if(bindingName1.equals(bindingName)) { - localColumnName = columnName1; - externalBindingName = bindingName2; - externalColumnName = columnName2; - } else if(bindingName2.equals(bindingName)) { - localColumnName = columnName2; - externalBindingName = bindingName1; - externalColumnName = columnName1; - } else { - throw new UserException.BadArgumentValue("-J", "The name \"" + bindingName + "\" in the -J arg: \"" + joinArg + "\" must be specified in one the columns to join on. (ex: -J name,/path/to/file,name.columnName=name2.columnName2)"); - } - - //validate externalColumnName - final String fullyQualifiedExternalColumnName = externalBindingName + '.' + externalColumnName; - if( !allFullyQualifiedColumnNames.contains(fullyQualifiedExternalColumnName) ) { - throw new UserException.BadArgumentValue("-J", "The -J arg: \"" + joinArg + "\" specifies an unknown column name: \"" + fullyQualifiedExternalColumnName + "\""); - } - - //read in the file contents into a JoinTable object - final JoinTable joinTable = new JoinTable(MAX_JOIN_TABLE_SIZE); - joinTable.parseFromFile(filename, localBindingName, localColumnName, externalBindingName, externalColumnName); - joinTables.add(joinTable); - - //validate localColumnName, and add all column names in this file to the list of allFullyQualifiedColumnNames so that they can be referenced from subsequent -J args. 
- final List columnNames = joinTable.getColumnNames(); - final List fullyQualifiedColumnNames = new LinkedList(); - boolean found = false; - for ( String columnName : columnNames ) { - if ( columnName.equals(localColumnName) ) - found = true; - fullyQualifiedColumnNames.add(localBindingName + '.' + columnName); - } - if ( !found ) - throw new UserException.BadArgumentValue("-J", "The -J arg: \"" + joinArg + "\" specifies an unknown column name: \"" + localColumnName + "\". It's not one of the column names in the header " + columnNames + " of the file: " + filename); - - allFullyQualifiedColumnNames.addAll(fullyQualifiedColumnNames); - } - - //parse the SELECT_COLUMNS arg and validate the column names - List parsedSelectColumns = new LinkedList(); - for ( String token : SELECT_COLUMNS ) - parsedSelectColumns.addAll(Arrays.asList(token.split(","))); - SELECT_COLUMNS = parsedSelectColumns.toArray(SELECT_COLUMNS); - - for ( String columnName : SELECT_COLUMNS ) { - if ( !allFullyQualifiedColumnNames.contains(columnName) ) - throw new UserException.BadArgumentValue("-s", "The column name '" + columnName + "' provided to -s doesn't match any of the column names in any of the -B files. 
Here is the list of available column names: " + allFullyQualifiedColumnNames); - } - - //instantiate the VariantAnnotatorEngine - ArrayList annotationsToUse = new ArrayList(); - annotationsToUse.add("GenomicAnnotation"); - engine = new VariantAnnotatorEngine(getToolkit(), new ArrayList(), annotationsToUse); - engine.setOneToMany(ONE_TO_MANY); - engine.setRequestedColumns(SELECT_COLUMNS); - engine.setJoinTables(joinTables); - - // set up the header fields - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), Arrays.asList("variant"))); - hInfo.addAll(engine.getVCFAnnotationDescriptions()); - - Set rodName = new HashSet(); - rodName.add("variant"); - Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName); - VCFHeader vcfHeader = new VCFHeader(hInfo, samples); - vcfWriter.writeHeader(vcfHeader); - } - - /** - * Initialize the number of loci processed to zero. - * - * @return 0 - */ - public Integer reduceInit() { return 0; } - - /** - * We want reads that span deletions - * - * @return true - */ - public boolean includeReadsWithDeletionAtLoci() { return true; } - - /** - * For each site of interest, annotate based on the requested annotation types - * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return 1 if the locus was successfully processed, 0 if otherwise - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - Set results = new LinkedHashSet(); - for (VariantContext vc : tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false)) { - if ( (vc.isFiltered() && IGNORE_FILTERED_SITES) || - (vc.isVariant() && !vc.isBiallelic()) ) { - results.add(vc); - } else { - Map stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context); - if ( stratifiedContexts != null ) - 
results.addAll(engine.annotateContext(tracker, ref, stratifiedContexts, vc)); - else - results.add(vc); - } - } - - for ( VariantContext vc : results ) - vcfWriter.add(vc ,ref.getBase()); - - return 1; - } - - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - public Integer treeReduce(Integer lhs, Integer rhs) { - return lhs + rhs; - } - - public void onTraversalDone(Integer sum) { - - //out.printf("Generated %d annotated VCF records.\n", totalOutputVCFRecords); - Map inputTableHitCounter = engine.getInputTableHitCounter(); - for ( Entry e : inputTableHitCounter.entrySet() ) { - final String bindingName = e.getKey(); - final int counter = e.getValue(); - //final float percent = 100 * counter /(float) totalOutputVCFRecords; - //out.printf(" %-6.1f%% (%d) annotated with %s.\n", percent, counter, bindingName ); - System.out.printf(" %d annotated with %s.\n", counter, bindingName ); - } - } -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTable.java deleted file mode 100755 index 714f374cf..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTable.java +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; - -/** - * This is a container that holds all data corresponding to a single join table as specified by one -J arg (ex: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2). - * Some terminology: - * 'bindingName' is an arbitrary label for a given table that is specified on the command line with either the -B or -J arg. - * In the example above, bindingName1 is the 'local' binding name because it is attached to the join table file provided with this -J arg. bindingName2 is the 'external' binding name because - * it corresponds to some other table specified previously with another -B or -J arg. - * - * The JoinTable object stores a map entry for each record in the join table. The entry's key is the value of the join column in a given record (eg. bindingName1.columnName in the above example), - * and the entry value is an ArrayList representing the entire join table record. 
- * The JoinTable object also stores some other join table parameters such as the column names that were parsed out of the file header, and the bindingNames and columnNames from the -J arg. - * - * The join operation is performed by looking up the value of the join column in the external table (the one that this table is being joined to), and then using this value to do a lookup - * on the map - if there's a hit, it will provide the record from the join table that is to be joined with the record in the external table. - * - * More information can be found here: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator - */ -public class JoinTable -{ - //the list of join table column names parsed out of the file header. - private List columnNames; //not fully-qualified - - private String localBindingName; - private String externalBindingName; - private String externalColumnName; - - //stores a map entry for each record in the join table. The entry's key is the value of the join column in a given record (eg. bindingName.columnName in the above example), - //and the entry value is an ArrayList representing the entire join table record. - private HashMap> joinColumnValueToRecords = new HashMap>(); - - private int maxSize; - private boolean parsedFromFile = false; - - public JoinTable(int maxSize) { - this.maxSize = maxSize; - } - - /** - * Parses the table from the given file using the JoinTableParser. - * - * @param filename The file containing the table. - * @param localBindingName The binding name within the given file to join on. - * @param localColumnName The column name within the given file to join on. - * @param externalBindingName The binding name of another file (previously specified with either -B or -J). - * @param externalColumnName The column name in this other file to join on. 
- */ - public void parseFromFile(String filename, String localBindingName, String localColumnName, String externalBindingName, String externalColumnName) { - if(parsedFromFile) { - throw new ReviewedStingException("parseFromFile(" + filename +", ..) called more than once"); - } - parsedFromFile = true; - - setLocalBindingName(localBindingName); - setExternalBindingName(externalBindingName); - setExternalColumnName(externalColumnName); - - BufferedReader br = null; - try - { - br = new BufferedReader(new FileReader(filename)); - final JoinTableParser parser = new JoinTableParser(); - - //read in the header - columnNames = parser.readHeader(br); - - //get the index of the localJoinColumnName - int localColumnNameIdx = -1; - for(int i = 0; i < columnNames.size(); i++) { - final String columnName = columnNames.get(i); - if(columnName.equals(localColumnName)) { - localColumnNameIdx = i; - break; - } - } - - if(localColumnNameIdx == -1) { - throw new UserException.BadArgumentValue("-J", "The -J arg specifies an unknown column name: \"" + localColumnName + "\". 
It's not one of the column names in the header " + columnNames + " of the file: " + filename); - } - - //read in all records and create a map entry for each - String line; - while((line = br.readLine()) != null) { - final ArrayList columnValues = parser.parseLine(line); - if ( columnValues.size() < columnNames.size() ) - throw new UserException.BadInput("the file: " + filename + " is malformed as there are not a sufficient number of columns for this line: " + line); - final String joinColumnValue = columnValues.get(localColumnNameIdx); - put(joinColumnValue, columnValues, filename); - } - } - catch(IOException e) - { - throw new UserException.CouldNotReadInputFile(new File(filename), "Unable to parse file", e); - } - finally - { - try { - if(br != null) { - br.close(); - } - } catch(IOException e) { - throw new ReviewedStingException("Unable to close file: " + filename, e); - } - } - } - - /** - * If the -J arg was: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2, - * this returns bindingName1. - * @return local binding name - */ - public String getLocalBindingName() { - return localBindingName; - } - - public void setLocalBindingName(String localBindingName) { - this.localBindingName = localBindingName; - } - - /** - * @return the list of join table column names parsed out of the file header. - */ - public List getColumnNames() { - return columnNames; //not fully-qualified - } - - protected void setColumnNames(List columnNames) { - this.columnNames = columnNames; - } - - /** - * If the -J arg was: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2, - * this returns columnName2. 
- * @return external column name - */ - public String getExternalColumnName() { - return externalColumnName; - } - - protected void setExternalColumnName( - String externalColumnName) { - this.externalColumnName = externalColumnName; - } - - /** - * If the -J arg was: -J bindingName1,/path/to/file,bindingName1.columnName=bindingName2.columnName2, - * this returns bindingName2. - * @return external binding name - */ - public String getExternalBindingName() { - return externalBindingName; - } - - protected void setExternalBindingName( - String externalBindingName) { - this.externalBindingName = externalBindingName; - } - - /** - * Whether any join table records have the given value in the join column. - * @param joinColumnValue value - * @return true if the given name value exists in the file - */ - public boolean containsJoinColumnValue(String joinColumnValue) { - return joinColumnValueToRecords.containsKey(joinColumnValue); - } - - /** - * Returns all records in the table where the join column has the given value. - * @param joinColumnValue column value - * @return row - */ - public ArrayList get(String joinColumnValue) { - return joinColumnValueToRecords.get(joinColumnValue); - } - - /** - * Adds the given record to the map. 
- * @param joinColumnValue value - * @param record row - * @param filename the source file name - */ - protected void put(String joinColumnValue, ArrayList record, String filename) { - if ( joinColumnValueToRecords.containsKey(joinColumnValue) ) - throw new UserException.BadInput("the file " + filename + " contains non-unique entries for the requested column, which isn't allowed."); - joinColumnValueToRecords.put(joinColumnValue, record); - if ( joinColumnValueToRecords.size() > maxSize ) - throw new UserException.BadInput("the file " + filename + " contains more than the maximum number (" + maxSize + ") of allowed rows (see the --maxJoinTableSize argument)."); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTableParser.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTableParser.java deleted file mode 100755 index 3b6c87f90..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/JoinTableParser.java +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.BufferedReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -/** - * Used to parse files passed to the GenomicAnnotator via the -J arg. - * The files must be tab-delimited, and the first non-empty/non-commented line - * must be a header containing column names. - * - * More information can be found here: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator - */ -public class JoinTableParser -{ - public static final String DELIMITER = "\t"; - - private List header; //column names parsed out of the header line - - - /** - * Constructor. - */ - public JoinTableParser() {} - - /** - * Returns the header and returns it. - * @param br source - * @return column names - * @throws IOException on read - */ - public List readHeader(BufferedReader br) throws IOException - { - if(header != null) { - throw new ReviewedStingException("readHeader(..) called more than once. Header is currently set to: " + header); - } - - header = Collections.unmodifiableList(parseHeader(br)); - - return header; - } - - - /** - * @return A list containing the column names. - */ - public List getHeader() { - return header; - } - - - /** - * Parses the line into an ArrayList containing the values for each column. 
- * - * @param line to parse - * @return tokens - */ - public ArrayList parseLine(String line) { - - final ArrayList values = Utils.split(line, DELIMITER, header.size()); - - if ( values.size() != header.size() ) { - throw new UserException.MalformedFile(String.format("Encountered a row with %d columns which is different from the number or columns in the header: %d\nHeader: " + header + "\nLine: " + values, values.size(), header.size())); - } - - return values; - } - - - /** - * Returns the header. - * @param br The file to read. - * @return ArrayList containing column names from the header. - * @throws IOException on reading - */ - public static ArrayList parseHeader(final BufferedReader br) throws IOException - { - ArrayList header = null; - - //find the 1st line that's non-empty and not a comment - String line; - while( (line = br.readLine()) != null ) { - line = line.trim(); - if ( line.isEmpty() || line.startsWith("#") ) { - continue; - } - - //parse the header - header = Utils.split(line, DELIMITER); - break; - } - - // check that header was found - if ( header == null ) { - throw new IllegalArgumentException("No header in " + br + ". 
All lines are either comments or empty."); - } - - return header; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/TranscriptToGenomicInfo.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/TranscriptToGenomicInfo.java deleted file mode 100755 index 0bbfa51b4..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/TranscriptToGenomicInfo.java +++ /dev/null @@ -1,1032 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableCodec; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.IOException; -import java.io.PrintStream; -import java.util.*; - -/** - * Takes a table of transcripts (eg. UCSC refGene, knownGene, and CCDS tables) and generates the big table which contains - * annotations for each possible variant at each transcript position (eg. 4 variants at each genomic position). - * - * Required args: - * -B - specifies the input file (ex. -B transcripts,AnnotatorInputTable,/path/to/transcript_table_file.txt) - * -n - Specifies which column(s) from the transcript table contain the gene name(s). (ex. -n name,name2 (for the UCSC refGene table)) - * WARNING: The gene names for each record, when taken together, should provide a unique id for that record relative to all other records in the file. - * - * - * The map & reduce types are both TreeMap. - * Each TreeMap entry represents one line in the output file. 
The TreeMap key is a combination of a given output line's position (so that this key can be used to sort all output lines - * by reference order), as well as allele and gene names (so that its unique across all output lines). The String value is the output line itself. - */ -@Reference(window=@Window(start=-4,stop=4)) -@By(DataSource.REFERENCE) -@Requires(value={DataSource.REFERENCE}, referenceMetaData={ @RMD(name=TranscriptToGenomicInfo.ROD_NAME,type=AnnotatorInputTableFeature.class) } ) -public class TranscriptToGenomicInfo extends RodWalker { - public static final String ROD_NAME = "transcripts"; - - //@Argument(fullName="pass-through", shortName="t", doc="Optionally specifies which columns from the transcript table should be copied verbatim (aka. passed-through) to the records in the output table. For example, -B transcripts,AnnotatorInputTable,/data/refGene.txt -t id will cause the refGene id column to be copied to the output table.", required=false) - //protected String[] PASS_THROUGH_COLUMNS = {}; - - @Output - private PrintStream out; - - @Argument(fullName="unique-gene-name-columns", shortName="n", doc="Specifies which column(s) from the transcript table contains the gene name(s). For example, -B transcripts,AnnotatorInputTable,/data/refGene.txt -n name,name2 specifies that the name and name2 columns are gene names. WARNING: the gene names for each record, when taken together, should provide a unique id for that record relative to all other records in the file. If this is not the case, an error will be thrown. 
", required=true) - private String[] GENE_NAME_COLUMNS = {}; - - private final char[] ALLELES = {'A','C','G','T'}; - - /** Output columns */ - private static final String[] GENOMIC_ANNOTATION_COLUMNS = { - GenomicAnnotation.CHR_COLUMN, - GenomicAnnotation.START_COLUMN, - GenomicAnnotation.END_COLUMN, - GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN, - GenomicAnnotation.HAPLOTYPE_ALTERNATE_COLUMN}; - - private static final String OUTPUT_TRANSCRIPT_STRAND = "transcriptStrand"; //rg. +/- - private static final String OUTPUT_IN_CODING_REGION = "inCodingRegion"; //eg. true - private static final String OUTPUT_FRAME = "frame"; //eg. 0,1,2 - private static final String OUTPUT_POSITION_TYPE = "positionType"; //eg. utr5, cds, utr3, intron, intergenic - private static final String OUTPUT_MRNA_COORD = "mrnaCoord"; //1-based offset within the transcript - private static final String OUTPUT_SPLICE_DISTANCE = "spliceDist"; //eg. integer, bp to nearest exon/intron boundary - private static final String OUTPUT_CODON_NUMBER = "codonCoord"; //eg. 20 - private static final String OUTPUT_REFERENCE_CODON = "referenceCodon"; - private static final String OUTPUT_REFERENCE_AA = "referenceAA"; - private static final String OUTPUT_VARIANT_CODON = "variantCodon"; - private static final String OUTPUT_VARIANT_AA = "variantAA"; - private static final String OUTPUT_CHANGES_AMINO_ACID = "changesAA"; //eg. true - private static final String OUTPUT_FUNCTIONAL_CLASS = "functionalClass"; //eg. 
missense - private static final String OUTPUT_CODING_COORD_STR = "codingCoordStr"; - private static final String OUTPUT_PROTEIN_COORD_STR = "proteinCoordStr"; - private static final String OUTPUT_SPLICE_INFO = "spliceInfo"; //(eg "splice-donor -4", or "splice-acceptor 3") for the 10bp surrounding each exon/intron boundary - private static final String OUTPUT_UORF_CHANGE = "uorfChange"; // (eg +1 or -1, indicating the addition or interruption of an ATG trinucleotide in the annotated utr5) - private static final String[] TRANSCRIPT_COLUMNS = { - OUTPUT_TRANSCRIPT_STRAND, - OUTPUT_POSITION_TYPE, - OUTPUT_FRAME, - OUTPUT_MRNA_COORD, - OUTPUT_CODON_NUMBER, - OUTPUT_SPLICE_DISTANCE, - OUTPUT_REFERENCE_CODON, - OUTPUT_REFERENCE_AA, - OUTPUT_VARIANT_CODON, - OUTPUT_VARIANT_AA, - OUTPUT_CHANGES_AMINO_ACID, - OUTPUT_FUNCTIONAL_CLASS, - OUTPUT_CODING_COORD_STR, - OUTPUT_PROTEIN_COORD_STR, - OUTPUT_IN_CODING_REGION, - OUTPUT_SPLICE_INFO, - OUTPUT_UORF_CHANGE }; - - //This list specifies the order of output columns in the big table. - private final List outputColumnNames = new LinkedList(); - - private int transcriptsProcessedCounter = 0; - - private long transcriptsThatDontStartWithMethionineOrEndWithStopCodonCounter = 0; - private long transcriptsThatDontStartWithMethionineCounter = 0; - private long transcriptsThatDontEndWithStopCodonCounter = 0; - private long skippedTranscriptCounter = 0; - - private long skippedPositionsCounter = 0; - private long totalPositionsCounter = 0; - - /** Possible values for the "POSITION_TYPE" output column. */ - private enum PositionType { - intergenic, intron, utr5, CDS, utr3, non_coding_exon, non_coding_intron - } - - /** - * Store rods until we hit their ends so that we don't have to recompute - * basic information every time we see them in map(). - */ - private Map storedTranscriptInfo = new HashMap(); - - /** - * Prepare the output file and the list of available features. 
- */ - public void initialize() { - - //parse the GENE_NAME_COLUMNS arg and validate the column names - final List parsedGeneNameColumns = new LinkedList(); - for(String token : GENE_NAME_COLUMNS) { - parsedGeneNameColumns.addAll(Arrays.asList(token.split(","))); - } - GENE_NAME_COLUMNS = parsedGeneNameColumns.toArray(GENE_NAME_COLUMNS); - - ReferenceOrderedDataSource transcriptsDataSource = null; - for(ReferenceOrderedDataSource ds : getToolkit().getRodDataSources()) { - if(ds.getName().equals(ROD_NAME)) { - transcriptsDataSource = ds; - break; - } - } - - // sanity check - if ( transcriptsDataSource == null ) - throw new IllegalStateException("No rod bound to " + ROD_NAME + " found in rod sources"); - - final ArrayList header; - try { - header = AnnotatorInputTableCodec.readHeader(transcriptsDataSource.getFile()); - } catch(Exception e) { - throw new UserException.MalformedFile(transcriptsDataSource.getFile(), "Failed when attempting to read header from file", e); - } - - for ( String columnName : GENE_NAME_COLUMNS ) { - if ( !header.contains(columnName) ) - throw new UserException.CommandLineException("The column name '" + columnName + "' provided to -n doesn't match any of the column names in: " + transcriptsDataSource.getFile()); - } - - //init outputColumnNames list - outputColumnNames.addAll(Arrays.asList(GENOMIC_ANNOTATION_COLUMNS)); - outputColumnNames.addAll(Arrays.asList(GENE_NAME_COLUMNS)); - outputColumnNames.addAll(Arrays.asList(TRANSCRIPT_COLUMNS)); - - //init OUTPUT_HEADER_LINE - StringBuilder outputHeaderLine = new StringBuilder(); - for( final String column : outputColumnNames ) { - if(outputHeaderLine.length() != 0) { - outputHeaderLine.append( AnnotatorInputTableCodec.DELIMITER ); - } - outputHeaderLine.append(column); - } - - out.println(outputHeaderLine.toString()); - } - - public Integer reduceInit() { return 0; } - - /** - * For each site of interest, generate the appropriate fields. 
- * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return 1 if the locus was successfully processed, 0 if otherwise - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - final Collection rods = tracker.getBoundRodTracks(); - //if there's nothing overlapping this locus, skip it. - if ( rods.size() == 0 ) - return 0; - - final List transcriptRODs = tracker.getReferenceMetaData(ROD_NAME); - - //there may be multiple transcriptRODs that overlap this locus - for ( Object transcriptRodObject : transcriptRODs ) { - //parse this ROD if it hasn't been already. - final AnnotatorInputTableFeature transcriptRod = (AnnotatorInputTableFeature) transcriptRodObject; - String featureKey = transcriptRod.toString(); - - TranscriptTableRecord parsedTranscriptRod = storedTranscriptInfo.get(featureKey); - if ( parsedTranscriptRod == null ) { - parsedTranscriptRod = new TranscriptTableRecord(transcriptRod, GENE_NAME_COLUMNS); - storedTranscriptInfo.put(featureKey, parsedTranscriptRod); - } - - //populate parsedTranscriptRod.txSequence - if(parsedTranscriptRod.positiveStrand) { - parsedTranscriptRod.txSequence.append((char)ref.getBase()); - } else { - final char complementBase = (char)BaseUtils.simpleComplement(ref.getBase()); - parsedTranscriptRod.txSequence.insert(0, complementBase); - } - - //populate parsedTranscriptRod.utr5Sequence and parsedTranscriptRod.cdsSequence - final int position = (int) ref.getLocus().getStart(); - if(parsedTranscriptRod.isProteinCodingTranscript() && parsedTranscriptRod.isWithinExon(position) ) - { - //we're within an exon of a proteinCodingTranscript - - if(parsedTranscriptRod.positiveStrand) - { - if(position < parsedTranscriptRod.cdsStart) - { - parsedTranscriptRod.utr5Sequence.append((char)ref.getBase()); //within utr5 - } - else if(position >= parsedTranscriptRod.cdsStart && 
position <= parsedTranscriptRod.cdsEnd) - { - parsedTranscriptRod.cdsSequence.append((char)ref.getBase()); //within CDS - } - } - else - { - final char complementBase = (char)BaseUtils.simpleComplement(ref.getBase()); - if(position > parsedTranscriptRod.cdsEnd) - { - //As we move left to right (aka. 3' to 5'), we do insert(0,..) to reverse the sequence so that it become 5' to 3' in parsedTranscriptRod.utr5Sequence. - parsedTranscriptRod.utr5Sequence.insert(0,complementBase); //within utr5. - } - else if(position >= parsedTranscriptRod.cdsStart && position <= parsedTranscriptRod.cdsEnd) - { - parsedTranscriptRod.cdsSequence.insert(0,complementBase); //within CDS - } - } - } - - if ( position == parsedTranscriptRod.txEnd ) { - //we've reached the end of the transcript - compute all data and write it out. - try { - generateOutputRecordsForROD(parsedTranscriptRod); - } - catch(IOException e) { - throw new RuntimeException(Thread.currentThread().getName() + " - Unexpected error occurred at position: [" + parsedTranscriptRod.txChrom + ":" + position + "] in transcript: " + parsedTranscriptRod, e); - } - - // remove it from the cache - storedTranscriptInfo.remove(featureKey); - - transcriptsProcessedCounter++; - if ( transcriptsProcessedCounter % 100 == 0 ) - logger.info(new Date() + ": " + transcriptsProcessedCounter + " transcripts processed"); - } - } - - return 1; - } - - private static boolean isChrM(final TranscriptTableRecord record) { - return record.txChrom.equals("chrM") || record.txChrom.equals("MT")|| record.txChrom.equals("CRS"); - } - - private void generateOutputRecordsForROD(TranscriptTableRecord parsedTranscriptRod) throws IOException { - //Transcripts that don't produce proteins are indicated in transcript by cdsStart == cdsEnd - //These will be handled by generating only one record, with haplotypeAlternate == "*". 
- final boolean isProteinCodingTranscript = parsedTranscriptRod.isProteinCodingTranscript(); - final boolean isMitochondrialTranscript = isChrM(parsedTranscriptRod); - - final boolean positiveStrand = parsedTranscriptRod.positiveStrand; //alias - - - if(isProteinCodingTranscript && parsedTranscriptRod.cdsSequence.length() % 3 != 0) { - if (!isMitochondrialTranscript) { - logger.error("ERROR: Transcript " + parsedTranscriptRod +" at position ["+ parsedTranscriptRod.txChrom + ":" +parsedTranscriptRod.txStart + "-" + parsedTranscriptRod.txEnd + "] has " + parsedTranscriptRod.cdsSequence.length() + " nucleotides in its CDS region, which is not divisible by 3. Skipping..."); - //discard transcripts where CDS length is not a multiple of 3 - skippedTranscriptCounter++; - return; - } else { - - //In mitochondrial genes, the polyA tail may complete the stop codon, allowing transcript . To check for this special case: - //1. check that the CDS covers the entire transcript - //2. add 1 or 2 A's to the 3' end of the transcript (as needed to make it divisible by 3) - //3. check whether the last 3 letters now form a stop codon using the mitochondrial AA table - //4. If not, skip this gene, else incorporate the A's and process it like any other gene. 
- - if( parsedTranscriptRod.txSequence.length() == parsedTranscriptRod.cdsSequence.length()) { - do { //append A's until sequence length is divisible by 3 - parsedTranscriptRod.txSequence.append('*'); - parsedTranscriptRod.cdsSequence.append('a'); - if(positiveStrand) { - parsedTranscriptRod.txEnd++; - parsedTranscriptRod.cdsEnd++; - parsedTranscriptRod.exonEnds[0]++; - } else { - parsedTranscriptRod.txStart--; - parsedTranscriptRod.cdsStart--; - parsedTranscriptRod.exonStarts[0]--; - } - } while( parsedTranscriptRod.cdsSequence.length() % 3 != 0); - - } else { - logger.error("ERROR: Mitochnodrial transcript " + parsedTranscriptRod +" at position ["+ parsedTranscriptRod.txChrom + ":" +parsedTranscriptRod.txStart + "-" + parsedTranscriptRod.txEnd + "] has " + parsedTranscriptRod.cdsSequence.length() + " nucleotides in its CDS region, which is not divisible by 3. The CDS does not cover the entire transcript, so its not possible to use A's from the polyA tail. Skipping..."); - skippedTranscriptCounter++; - return; - } - } - } - - - //warn if the first codon isn't Methionine and/or the last codon isn't a stop codon. - if(isProteinCodingTranscript) { - final int cdsSequenceLength = parsedTranscriptRod.cdsSequence.length(); - - final String firstCodon = parsedTranscriptRod.cdsSequence.substring(0, 3); - final AminoAcid firstAA = isMitochondrialTranscript ? AminoAcidTable.getMitochondrialAA( firstCodon, true ) : AminoAcidTable.getEukaryoticAA( firstCodon ) ; - - final String lastCodon = parsedTranscriptRod.cdsSequence.substring(cdsSequenceLength - 3, cdsSequenceLength); - final AminoAcid lastAA = isMitochondrialTranscript ? 
AminoAcidTable.getMitochondrialAA( lastCodon, false ) : AminoAcidTable.getEukaryoticAA( lastCodon ) ; - - if( firstAA != AminoAcidTable.METHIONINE && !lastAA.isStop()) { - transcriptsThatDontStartWithMethionineOrEndWithStopCodonCounter++; - logger.warn("WARNING: The CDS of transcript " + parsedTranscriptRod.geneNames[0] +" at position ["+ parsedTranscriptRod.txChrom + ":" +parsedTranscriptRod.txStart + "-" + parsedTranscriptRod.txEnd + "] does not start with Methionine or end in a stop codon. The first codon is: " + firstCodon + " (" + firstAA + "). The last codon is: " + lastCodon + " (" + lastAA + "). NOTE: This is just a warning - the transcript will be included in the output."); - } else if( firstAA != AminoAcidTable.METHIONINE) { - transcriptsThatDontStartWithMethionineCounter++; - logger.warn("WARNING: The CDS of transcript " + parsedTranscriptRod.geneNames[0] +" at position ["+ parsedTranscriptRod.txChrom + ":" +parsedTranscriptRod.txStart + "-" + parsedTranscriptRod.txEnd + "] does not start with Methionine. The first codon is: " + firstCodon + " (" + firstAA + "). NOTE: This is just a warning - the transcript will be included in the output."); - } else if(!lastAA.isStop()) { - transcriptsThatDontEndWithStopCodonCounter++; - logger.warn("WARNING: The CDS of transcript " + parsedTranscriptRod.geneNames[0] +" at position ["+ parsedTranscriptRod.txChrom + ":" +parsedTranscriptRod.txStart + "-" + parsedTranscriptRod.txEnd + "] does not end in a stop codon. The last codon is: " + lastCodon + " (" + lastAA + "). NOTE: This is just a warning - the transcript will be included in the output."); - } - } - - final int txStart_5prime = positiveStrand ? parsedTranscriptRod.txStart : parsedTranscriptRod.txEnd; //1-based, inclusive - final int txEnd_3prime = positiveStrand ? parsedTranscriptRod.txEnd : parsedTranscriptRod.txStart; //1-based, inclusive - final int increment_5to3 = positiveStrand ? 
1 : -1; //whether to increment or decrement - final int strandSign = increment_5to3; //alias - - final int cdsStart_5prime = positiveStrand ? parsedTranscriptRod.cdsStart : parsedTranscriptRod.cdsEnd; //1-based, inclusive - final int cdsEnd_3prime = positiveStrand ? parsedTranscriptRod.cdsEnd : parsedTranscriptRod.cdsStart ; //1-based, inclusive - - int frame = 0; //the frame of the current position - int txOffset_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand - int utr5Count_from5 = 0; - int mrnaCoord_from5 = 1; //goes from txStart 5' to txEnd 3' for both + and - strand, but only counts bases within exons. - char[] utr5NucBuffer_5to3 = null; //used to find uORFs - size = 5 because to hold the 3 codons that overlap any given position: [-2,-1,0], [-1,0,1], and [0,1,2] - - int codonCount_from5 = 1; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand - counts the number of codons - 1-based - int codingCoord_from5 = isProteinCodingTranscript ? parsedTranscriptRod.computeInitialCodingCoord() : -1; //goes from cdsStart 5' to cdsEnd 3' for both + and - strand - boolean codingCoordResetForCDS = false; - boolean codingCoordResetForUtr3 = false; - final char[] currentCodon_5to3 = isProteinCodingTranscript ? 
new char[3] : null; //holds the current RNA codon - 5' to 3' - - PositionType positionType = null; - boolean isWithinIntronAndFarFromSpliceJunction = false; - int intronStart_5prime = -1; - int intronEnd_5prime; - - final Map outputLineFields = new HashMap(); - - for(int txCoord_5to3 = txStart_5prime; txCoord_5to3 != txEnd_3prime + increment_5to3; txCoord_5to3 += increment_5to3) - { - ++totalPositionsCounter; - - //compute certain attributes of the current position - final boolean isWithinExon = parsedTranscriptRod.isWithinExon(txCoord_5to3); //TODO if necessary, this can be sped up by keeping track of current exon/intron - - final int distanceToNearestSpliceSite = parsedTranscriptRod.computeDistanceToNearestSpliceSite(txCoord_5to3); - final boolean isWithin10bpOfSpliceJunction = Math.abs(distanceToNearestSpliceSite) <= 10; - - - //increment coding coord is necessary - if(isWithinExon) { - codingCoord_from5++; - } - - //figure out the current positionType - final PositionType prevPositionType = positionType; //save the position before it is updated - if(isProteinCodingTranscript) - { - if(isWithinExon) - { - if( strandSign*(txCoord_5to3 - cdsStart_5prime) < 0 ) { //utr5 (multiplying by strandSign is like doing absolute value.) - positionType = PositionType.utr5; - } else if( strandSign*(txCoord_5to3 - cdsEnd_3prime) > 0 ) { //utr3 (multiplying by strandSign is like doing absolute value.) - positionType = PositionType.utr3; - } else { - positionType = PositionType.CDS; - } - } else { - positionType = PositionType.intron; - } - } else { - if(isWithinExon) { - positionType = PositionType.non_coding_exon; - } else { - positionType = PositionType.non_coding_intron; - } - } - - //handle transitions - if(positionType == PositionType.CDS && prevPositionType != PositionType.CDS && !codingCoordResetForCDS) { - //transitioning from utr5 to CDS, reset the coding coord from -1 to 1. 
- codingCoord_from5 = 1; - codingCoordResetForCDS = true; - } else if(positionType == PositionType.utr3 && prevPositionType != PositionType.utr3 && !codingCoordResetForUtr3) { - //transitioning from CDS to utr3, reset the coding coord to 1. - codingCoord_from5 = 1; - codingCoordResetForUtr3 = true; - } - - - try - { - //handle introns - boolean wasWithinIntronAndFarFromSpliceJunction = isWithinIntronAndFarFromSpliceJunction; - isWithinIntronAndFarFromSpliceJunction = !isWithinExon && !isWithin10bpOfSpliceJunction; - - if(!wasWithinIntronAndFarFromSpliceJunction && isWithinIntronAndFarFromSpliceJunction) { - //save intron start - intronStart_5prime = txCoord_5to3; - - } else if(wasWithinIntronAndFarFromSpliceJunction && !isWithinIntronAndFarFromSpliceJunction) { - //output intron record - intronEnd_5prime = txCoord_5to3 - increment_5to3; - - final int intronStart = (intronStart_5prime < intronEnd_5prime ? intronStart_5prime : intronEnd_5prime) ; - final int intronEnd = (intronEnd_5prime > intronStart_5prime ? intronEnd_5prime : intronStart_5prime); - outputLineFields.clear(); - outputLineFields.put(GenomicAnnotation.CHR_COLUMN, parsedTranscriptRod.txChrom); - outputLineFields.put(GenomicAnnotation.START_COLUMN, String.valueOf(intronStart)); - outputLineFields.put(GenomicAnnotation.END_COLUMN, String.valueOf(intronEnd)); - outputLineFields.put(GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN, Character.toString( '*' ) ); - outputLineFields.put(GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN, Character.toString( '*' ) ); - for(int i = 0; i < GENE_NAME_COLUMNS.length; i++) { - outputLineFields.put(GENE_NAME_COLUMNS[i], parsedTranscriptRod.geneNames[i] ); - } - - outputLineFields.put(OUTPUT_POSITION_TYPE, positionType.toString() ); - outputLineFields.put(OUTPUT_TRANSCRIPT_STRAND, positiveStrand ? 
"+" : "-" ); - - if ( isProteinCodingTranscript ) - outputLineFields.put(OUTPUT_IN_CODING_REGION, Boolean.toString(positionType == PositionType.CDS) ); - - addThisLineToResult(outputLineFields); - } - - //when in utr5, compute the utr5NucBuffer_5to3 which is later used to compute the OUTPUT_UORF_CHANGE field - if(positionType == PositionType.utr5) - { - if(utr5Count_from5 < parsedTranscriptRod.utr5Sequence.length()) - { - if(utr5NucBuffer_5to3 == null) { - //initialize - utr5NucBuffer_5to3 = new char[5]; - utr5NucBuffer_5to3[3] = parsedTranscriptRod.utr5Sequence.charAt( utr5Count_from5 ); - - if(utr5Count_from5 + 1 < parsedTranscriptRod.utr5Sequence.length() ) { - utr5NucBuffer_5to3[4] = parsedTranscriptRod.utr5Sequence.charAt( utr5Count_from5 + 1 ); - } - } - - //as we move 5' to 3', shift nucleotides down to the 5' end, making room for the new 3' nucleotide: - utr5NucBuffer_5to3[0] = utr5NucBuffer_5to3[1]; - utr5NucBuffer_5to3[1] = utr5NucBuffer_5to3[2]; - utr5NucBuffer_5to3[2] = utr5NucBuffer_5to3[3]; - utr5NucBuffer_5to3[3] = utr5NucBuffer_5to3[4]; - - char nextRefBase = 0; - if( utr5Count_from5 + 2 < parsedTranscriptRod.utr5Sequence.length() ) - { - nextRefBase = parsedTranscriptRod.utr5Sequence.charAt( utr5Count_from5 + 2 ); - } - utr5NucBuffer_5to3[4] = nextRefBase; - - //check for bad bases - if( (utr5NucBuffer_5to3[0] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[0])) || - (utr5NucBuffer_5to3[1] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[1])) || - (utr5NucBuffer_5to3[2] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[2])) || - (utr5NucBuffer_5to3[3] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[3])) || - (utr5NucBuffer_5to3[4] != 0 && !BaseUtils.isRegularBase(utr5NucBuffer_5to3[4]))) - { - logger.debug("Skipping current position [" + parsedTranscriptRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedTranscriptRod.geneNames.toString() +". 
utr5NucBuffer_5to3 contains irregular base:" + utr5NucBuffer_5to3[0] + utr5NucBuffer_5to3[1] + utr5NucBuffer_5to3[2] + utr5NucBuffer_5to3[3] + utr5NucBuffer_5to3[4]);// +". Transcript is: " + parsedTranscriptRod); - ++skippedPositionsCounter; - continue; - } - - } else { // if(utr5Count_from5 >= parsedTranscriptRod.utr5Sequence.length()) - //defensive programming - throw new RuntimeException("Exception: Skipping current position [" + parsedTranscriptRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedTranscriptRod.geneNames.toString() +". utr5Count_from5 is now " + utr5Count_from5 + ", while parsedTranscriptRod.utr5Sequence.length() == " + parsedTranscriptRod.utr5Sequence.length() + ". This means parsedTranscriptRod.utr5Sequence isn't as long as it should be. This is a bug in handling this record: " + parsedTranscriptRod); - - } - } - - - //when in CDS, compute current codon - if(positionType == PositionType.CDS) - { - if(frame == 0) - { - currentCodon_5to3[0] = parsedTranscriptRod.cdsSequence.charAt( codingCoord_from5 - 1 ); //subtract 1 to go to zero-based coords - currentCodon_5to3[1] = parsedTranscriptRod.cdsSequence.charAt( codingCoord_from5 ); - currentCodon_5to3[2] = parsedTranscriptRod.cdsSequence.charAt( codingCoord_from5 + 1); - } - - //check for bad bases - if(!BaseUtils.isRegularBase(currentCodon_5to3[0]) || !BaseUtils.isRegularBase(currentCodon_5to3[1]) || !BaseUtils.isRegularBase(currentCodon_5to3[2])) { - logger.debug("Skipping current position [" + parsedTranscriptRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedTranscriptRod.geneNames.toString() +". CDS codon contains irregular base:" + currentCodon_5to3[0] + currentCodon_5to3[1] + currentCodon_5to3[2]);// +". 
Transcript is: " + parsedTranscriptRod); - ++skippedPositionsCounter; - continue; - } - - } - - char haplotypeReference = parsedTranscriptRod.txSequence.charAt( txOffset_from5 - 1 ); - if(!positiveStrand) { - haplotypeReference = BaseUtils.simpleComplement(haplotypeReference); //txSequence contents depend on whether its +/- strand - } - char haplotypeReferenceStrandSpecific= positiveStrand ? haplotypeReference : BaseUtils.simpleComplement(haplotypeReference); - - - - if(!BaseUtils.isRegularBase(haplotypeReference) && haplotypeReference != '*') { //* is special case for mitochondrial genes where polyA tail completes the last codon - //check for bad bases - logger.debug("Skipping current position [" + parsedTranscriptRod.txChrom + ":" +txCoord_5to3 + "] in transcript " + parsedTranscriptRod.geneNames.toString() + ". The reference contains an irregular base:" + haplotypeReference); // +". Transcript is: " + parsedTranscriptRod); - ++skippedPositionsCounter; - continue; - } - - - char haplotypeAlternateStrandSpecific; - for(char haplotypeAlternate : ALLELES ) - { - haplotypeAlternateStrandSpecific= positiveStrand ? haplotypeAlternate : BaseUtils.simpleComplement(haplotypeAlternate); - outputLineFields.clear(); - - if(!isProteinCodingTranscript || isWithinIntronAndFarFromSpliceJunction) { - haplotypeReference = '*'; - haplotypeAlternate = '*'; - } - - //compute simple OUTPUT fields. 
- outputLineFields.put(GenomicAnnotation.CHR_COLUMN, parsedTranscriptRod.txChrom); - outputLineFields.put(GenomicAnnotation.START_COLUMN, String.valueOf(txCoord_5to3)); - outputLineFields.put(GenomicAnnotation.END_COLUMN, String.valueOf(txCoord_5to3)); - outputLineFields.put(GenomicAnnotation.HAPLOTYPE_REFERENCE_COLUMN, Character.toString( haplotypeReference ) ); - outputLineFields.put(GenomicAnnotation.HAPLOTYPE_ALTERNATE_COLUMN, Character.toString( haplotypeAlternate ) ); - for(int i = 0; i < GENE_NAME_COLUMNS.length; i++) { - outputLineFields.put(GENE_NAME_COLUMNS[i], parsedTranscriptRod.geneNames[i] ); - } - - outputLineFields.put(OUTPUT_POSITION_TYPE, positionType.toString() ); - outputLineFields.put(OUTPUT_TRANSCRIPT_STRAND, positiveStrand ? "+" : "-" ); - if(isWithinExon) { - outputLineFields.put(OUTPUT_MRNA_COORD, Integer.toString(mrnaCoord_from5) ); - } - outputLineFields.put(OUTPUT_SPLICE_DISTANCE, Integer.toString(distanceToNearestSpliceSite) ); - - //compute OUTPUT_SPLICE_INFO - final String spliceInfoString; - if(isWithin10bpOfSpliceJunction) { - if(distanceToNearestSpliceSite < 0) { - //is on the 5' side of the splice junction - if(isWithinExon) { - spliceInfoString = "splice-donor_" + distanceToNearestSpliceSite; - } else { - spliceInfoString = "splice-acceptor_" + distanceToNearestSpliceSite; - } - } else { - if(isWithinExon) { - spliceInfoString = "splice-acceptor_" + distanceToNearestSpliceSite; - } else { - spliceInfoString = "splice-donor_" + distanceToNearestSpliceSite; - } - } - outputLineFields.put(OUTPUT_SPLICE_INFO, spliceInfoString); - } - - //compute OUTPUT_IN_CODING_REGION - if(isProteinCodingTranscript) - { - outputLineFields.put(OUTPUT_IN_CODING_REGION, Boolean.toString(positionType == PositionType.CDS) ); - } - - - //compute OUTPUT_UORF_CHANGE - if(positionType == PositionType.utr5) - { - String refCodon1 = (Character.toString(utr5NucBuffer_5to3[0]) + Character.toString(utr5NucBuffer_5to3[1]) + utr5NucBuffer_5to3[2]).toUpperCase(); - 
String refCodon2 = (Character.toString(utr5NucBuffer_5to3[1]) + Character.toString(utr5NucBuffer_5to3[2]) + utr5NucBuffer_5to3[3]).toUpperCase(); - String refCodon3 = (Character.toString(utr5NucBuffer_5to3[2]) + Character.toString(utr5NucBuffer_5to3[3]) + utr5NucBuffer_5to3[4]).toUpperCase(); - - String varCodon1 = (Character.toString(utr5NucBuffer_5to3[0]) + Character.toString(utr5NucBuffer_5to3[1]) + haplotypeAlternateStrandSpecific).toUpperCase(); - String varCodon2 = (Character.toString(utr5NucBuffer_5to3[1]) + Character.toString(haplotypeAlternateStrandSpecific) + utr5NucBuffer_5to3[3]).toUpperCase(); - String varCodon3 = (Character.toString(haplotypeAlternateStrandSpecific) + Character.toString(utr5NucBuffer_5to3[3]) + utr5NucBuffer_5to3[4]).toUpperCase(); - - //check for +1 (eg. addition of new ATG uORF) and -1 (eg. disruption of existing ATG uORF) - String uORFChangeStr = null; - if( (refCodon1.equals("ATG") && !varCodon1.equals("ATG")) || - (refCodon2.equals("ATG") && !varCodon2.equals("ATG")) || - (refCodon3.equals("ATG") && !varCodon3.equals("ATG"))) - { - uORFChangeStr = "-1"; - } - else if((varCodon1.equals("ATG") && !refCodon1.equals("ATG")) || - (varCodon2.equals("ATG") && !refCodon2.equals("ATG")) || - (varCodon3.equals("ATG") && !refCodon3.equals("ATG"))) - { - uORFChangeStr = "+1"; - } - - outputLineFields.put(OUTPUT_UORF_CHANGE, uORFChangeStr ); - } - //compute CDS-specific fields - else if (positionType == PositionType.CDS) { - final String referenceCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2]; - final char temp = currentCodon_5to3[frame]; - currentCodon_5to3[frame] = haplotypeAlternateStrandSpecific; - final String variantCodon = Character.toString(currentCodon_5to3[0]) + Character.toString(currentCodon_5to3[1]) + currentCodon_5to3[2]; - currentCodon_5to3[frame] = temp; - - final AminoAcid refAA = isMitochondrialTranscript ? 
AminoAcidTable.getMitochondrialAA(referenceCodon, codonCount_from5 == 1) : AminoAcidTable.getEukaryoticAA( referenceCodon ) ; - final AminoAcid variantAA = isMitochondrialTranscript ? AminoAcidTable.getMitochondrialAA(variantCodon, codonCount_from5 == 1) : AminoAcidTable.getEukaryoticAA( variantCodon ) ; - - if (refAA.isUnknown() || variantAA.isUnknown()) { - logger.warn("Illegal amino acid detected: refCodon=" + referenceCodon + " altCodon=" + variantCodon); - } - outputLineFields.put(OUTPUT_TRANSCRIPT_STRAND, positiveStrand ? "+" : "-" ); - outputLineFields.put(OUTPUT_FRAME, Integer.toString(frame)); - outputLineFields.put(OUTPUT_CODON_NUMBER, Integer.toString(codonCount_from5)); - outputLineFields.put(OUTPUT_REFERENCE_CODON, referenceCodon); - outputLineFields.put(OUTPUT_REFERENCE_AA, refAA.getCode()); - - outputLineFields.put(OUTPUT_VARIANT_CODON, variantCodon); - outputLineFields.put(OUTPUT_VARIANT_AA, variantAA.getCode()); - - outputLineFields.put(OUTPUT_PROTEIN_COORD_STR, "p." + refAA.getLetter() + Integer.toString(codonCount_from5) + variantAA.getLetter()); //for example: "p.K7$ - - boolean changesAA = !refAA.equals(variantAA); - outputLineFields.put(OUTPUT_CHANGES_AMINO_ACID, Boolean.toString(changesAA)); - final String functionalClass; - if (changesAA) { - if (variantAA.isStop()) { - functionalClass = "nonsense"; - } else if (refAA.isStop()) { - functionalClass = "readthrough"; - } else { - functionalClass = "missense"; - } - } else { - functionalClass = "silent"; - } - outputLineFields.put(OUTPUT_FUNCTIONAL_CLASS, functionalClass); - } - - //compute OUTPUT_CODING_COORD_STR - if(isProteinCodingTranscript) - { - //compute coding coord - final StringBuilder codingCoordStr = new StringBuilder(); - codingCoordStr.append( "c." 
); - if(positionType == PositionType.utr3) { - codingCoordStr.append( '*' ); - } - - if(isWithinExon) { - codingCoordStr.append( Integer.toString(codingCoord_from5) ); - - codingCoordStr.append ( haplotypeReferenceStrandSpecific + ">" + haplotypeAlternateStrandSpecific); - } else { - //intronic coordinates - if(distanceToNearestSpliceSite < 0) { - codingCoordStr.append( Integer.toString(codingCoord_from5 + 1) ); - } else { - codingCoordStr.append( Integer.toString(codingCoord_from5 ) ); - codingCoordStr.append( "+" ); - } - - codingCoordStr.append( Integer.toString( distanceToNearestSpliceSite ) ); - } - - outputLineFields.put(OUTPUT_CODING_COORD_STR, codingCoordStr.toString()); - } - - - //generate the output line and add it to 'result' map. - if ( !isWithinIntronAndFarFromSpliceJunction ) - addThisLineToResult(outputLineFields); - - if( haplotypeAlternate == '*' ) { - //need only one record for this position with "*" for haplotypeAlternate, instead of the 4 individual alleles - break; - } - - } //ALLELE for-loop - } - finally - { - //increment coords - txOffset_from5++; - if(isWithinExon) { - mrnaCoord_from5++; - } - - if(positionType == PositionType.utr5) { - utr5Count_from5++; - } else if(positionType == PositionType.CDS) { - frame = (frame + 1) % 3; - if(frame == 0) { - codonCount_from5++; - } - } - } - } // l for-loop - - } //method close - - - /** - * Utility method. Creates a line containing the outputLineFields, and adds it to result, hashed by the sortKey. - * - * @param outputLineFields Column-name to value pairs. 
- */ - private void addThisLineToResult(final Map outputLineFields) { - final StringBuilder outputLine = new StringBuilder(); - for( final String column : outputColumnNames ) { - if(outputLine.length() != 0) { - outputLine.append( AnnotatorInputTableCodec.DELIMITER ); - } - final String value = outputLineFields.get(column); - if(value != null) { - outputLine.append(value); - } - } - - out.println(outputLine.toString()); - } - - public Integer reduce(Integer value, Integer sum) { return sum + value; } - - public void onTraversalDone(Integer result) { - logger.info("Skipped " + skippedPositionsCounter + " in-transcript genomic positions out of "+ totalPositionsCounter + " total (" + ( totalPositionsCounter == 0 ? 0 : (100*skippedPositionsCounter)/totalPositionsCounter) + "%)"); - logger.info("Skipped " + skippedTranscriptCounter + " transcripts out of "+ transcriptsProcessedCounter + " total (" + ( transcriptsProcessedCounter == 0 ? 0 : (100*skippedTranscriptCounter)/transcriptsProcessedCounter) + "%)"); - logger.info("Protein-coding transcripts (eg. with a CDS region) that don't start with Methionine or end in a stop codon: " + transcriptsThatDontStartWithMethionineOrEndWithStopCodonCounter + " transcripts out of "+ transcriptsProcessedCounter + " total (" + ( transcriptsProcessedCounter == 0 ? 0 : (100*transcriptsThatDontStartWithMethionineOrEndWithStopCodonCounter)/transcriptsProcessedCounter) + "%)"); - logger.info("Protein-coding transcripts (eg. with a CDS region) that don't start with Methionine: " + transcriptsThatDontStartWithMethionineCounter + " transcripts out of "+ transcriptsProcessedCounter + " total (" + ( transcriptsProcessedCounter == 0 ? 0 : (100*transcriptsThatDontStartWithMethionineCounter)/transcriptsProcessedCounter) + "%)"); - logger.info("Protein-coding transcripts (eg. 
with a CDS region) that don't end in a stop codon: " + transcriptsThatDontEndWithStopCodonCounter + " transcripts out of "+ transcriptsProcessedCounter + " total (" + ( transcriptsProcessedCounter == 0 ? 0 : (100*transcriptsThatDontEndWithStopCodonCounter)/transcriptsProcessedCounter) + "%)"); - } - - - /** - * Container for all data fields from a single row of the transcript table. - */ - protected static class TranscriptTableRecord - { - public static final String STRAND_COLUMN = "strand"; //eg. + - public static final String CDS_START_COLUMN = "cdsStart"; - public static final String CDS_END_COLUMN = "cdsEnd"; - public static final String EXON_COUNT_COLUMN = "exonCount"; - public static final String EXON_STARTS_COLUMN = "exonStarts"; - public static final String EXON_ENDS_COLUMN = "exonEnds"; - //public static final String EXON_FRAMES_COLUMN = "exonFrames"; - - - /** - * This StringBuffer accumulates the entire transcript sequence. - * This buffer is used instead of using the GATK window mechanism - * because arbitrary-length look-aheads and look-behinds are needed to deal - * with codons that span splice-junctions in + & - strand transcripts. - * The window mechanism requires hard-coding the window size, which would - * translate into a limit on maximum supported intron size. To avoid this, the - * sequence is accumulated as the transcript is scanned left-to-right. - * Then, all calculations are performed at the end. - */ - public StringBuilder txSequence; //the sequence of the entire transcript in order from 5' to 3' - public StringBuilder utr5Sequence; //the protein coding sequence (with introns removed) in order from 5' to 3' - public StringBuilder cdsSequence; //the protein coding sequence (with introns removed) in order from 5' to 3' - - public boolean positiveStrand; //whether the transcript is on the + or the - strand. - public String[] geneNames; //eg. 
NM_021649 - - public String txChrom; //The chromosome name - public int txStart; - public int txEnd; - - public int cdsStart; - public int cdsEnd; - - public int[] exonStarts; - public int[] exonEnds; - //public int[] exonFrames; - not used for anything, frame is computed another way - - /** - * Constructor. - * - * @param transcriptRod A rod representing a single record in the transcript table. - * @param geneNameColumns name columns. - */ - public TranscriptTableRecord(final AnnotatorInputTableFeature transcriptRod, String[] geneNameColumns) { - - //String binStr = transcriptRod.get("bin"); - //String idStr = transcriptRod.get("id"); //int(10) unsigned range Unique identifier ( usually 0 for some reason - even for translated ) - String strandStr = transcriptRod.getColumnValue(STRAND_COLUMN); - if(strandStr == null) { - throw new IllegalArgumentException("Transcript table record doesn't contain a 'strand' column. Make sure the transcripts input file has a header and the usual columns: \"" + strandStr + "\""); - } else if(strandStr.equals("+")) { - positiveStrand = true; - } else if(strandStr.equals("-")) { - positiveStrand = false; - } else { - throw new IllegalArgumentException("Transcript table record contains unexpected value for 'strand' column: \"" + strandStr + "\""); - } - - geneNames = new String[geneNameColumns.length]; - for(int i = 0; i < geneNameColumns.length; i++) { - geneNames[i] = transcriptRod.getColumnValue(geneNameColumns[i]); - } - - //String txStartStr = transcriptRod.get(TXSTART_COLUMN); //These fields were used to generate column 1 of the ROD file (eg. 
they got turned into chr:txStart-txStop) - //String txEndStr = transcriptRod.get(TXEND_COLUMN); - txChrom = transcriptRod.getChr(); - txStart = transcriptRod.getStart(); - txEnd = transcriptRod.getEnd(); - - String cdsStartStr = transcriptRod.getColumnValue(CDS_START_COLUMN); - String cdsEndStr = transcriptRod.getColumnValue(CDS_END_COLUMN); - - cdsStart = Integer.parseInt(cdsStartStr); - cdsEnd = Integer.parseInt(cdsEndStr); - - txSequence = new StringBuilder( (txEnd - txStart + 1) ); //the sequence of the entire transcript in order from 5' to 3' - if(isProteinCodingTranscript()) { - utr5Sequence = new StringBuilder( positiveStrand ? (cdsStart - txStart + 1) : (txEnd - cdsEnd + 1) ); //TODO reduce init size by size of introns - cdsSequence = new StringBuilder( (cdsEnd - cdsStart + 1) ); //TODO reduce init size by size of introns - } - - String exonCountStr = transcriptRod.getColumnValue(EXON_COUNT_COLUMN); - String exonStartsStr = transcriptRod.getColumnValue(EXON_STARTS_COLUMN); - String exonEndsStr = transcriptRod.getColumnValue(EXON_ENDS_COLUMN); - //String exonFramesStr = transcriptRod.get(EXON_FRAMES_COLUMN); - - String[] exonStartStrs = exonStartsStr.split(","); - String[] exonEndStrs = exonEndsStr.split(","); - //String[] exonFrameStrs = exonFramesStr.split(","); - - int exonCount = Integer.parseInt(exonCountStr); - if(exonCount != exonStartStrs.length || exonCount != exonEndStrs.length /* || exonCount != exonFrameStrs.length */) - { - throw new RuntimeException("exonCount != exonStarts.length || exonCount != exonEnds.length || exonCount != exonFrames.length. Exon starts: " + exonStartsStr + ", Exon ends: " + exonEndsStr + /*", Exon frames: " + exonFramesStr + */", Exon count: " + exonCountStr +". 
transcriptRod = " + transcriptRod); - } - - exonStarts = new int[exonCount]; - exonEnds = new int[exonCount]; - //exonFrames = new int[exonCount]; - for(int i = 0; i < exonCount; i++) { - exonStarts[i] = Integer.parseInt(exonStartStrs[i]); - exonEnds[i] = Integer.parseInt(exonEndStrs[i]); - //exonFrames[i] = Integer.parseInt(exonFrameStrs[i]); - } - } - - - /** - * Takes a genomic position on the same contig as the transcript, and - * returns true if this position falls within an exon. - */ - public boolean isWithinExon(final int genomPosition) { - for(int i = 0; i < exonStarts.length; i++) { - final int curStart = exonStarts[i]; - if(genomPosition < curStart) { - return false; - } - final int curStop = exonEnds[i]; - if(genomPosition <= curStop) { - return true; - } - } - - return false; - } - - /** - * Computes the distance to the nearest splice-site. - * The returned value is negative its on the 5' side (eg. upstream) of the juntion, and - * positive if its on the 3' side. - */ - public int computeDistanceToNearestSpliceSite(final int genomPosition) { - int prevDistance = Integer.MAX_VALUE; - for(int i = 0; i < exonStarts.length; i++) { - final int curStart = exonStarts[i]; - int curDistance = curStart - genomPosition; - if(genomPosition < curStart) { - //position is within the current intron - if(prevDistance < curDistance) { - return positiveStrand ? prevDistance : -prevDistance; - } else { - return positiveStrand ? -curDistance : curDistance; - } - } else { - prevDistance = genomPosition - curStart + 1; - } - - final int curStop = exonEnds[i]; - curDistance = curStop - genomPosition + 1; - if(genomPosition <= curStop) { - //position is within an exon - if(prevDistance < curDistance) { - return positiveStrand ? prevDistance : -prevDistance; - } else { - return positiveStrand ? 
-curDistance : curDistance; - } - } else { - prevDistance = genomPosition - curStop; - } - } - - throw new IllegalArgumentException("Genomic position: [" + genomPosition +"] not found within transcript: " + this +". " + - "This method should not have been called for this position. NOTE: this method assumes that all transcripts start " + - "with an exon and end with an exon (rather than an intron). Is this wrong?"); - //return prevDistance; //out of exons. return genomPosition-curStop - } - - - /** - * Returns true if this is a coding transcript (eg. is translated - * into proteins). Returns false for non-coding RNA. - */ - public boolean isProteinCodingTranscript() { - return cdsStart < cdsEnd; - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("chrpos=" + txChrom + ':' + txStart + '-' + txEnd + ", strand=" + (positiveStrand ? '+':'-') + ", gene-names=" + Arrays.toString(geneNames) + ", cds="+ cdsStart + '-' + cdsEnd + ", exonStarts=" + Arrays.toString(exonStarts) + ", exonEnds=" + Arrays.toString(exonEnds)); - return sb.toString(); - } - - - - /** - * Computes the coding coord of the 1st nucleotide in the transcript. - * If the 1st nucleotide is in the 5'utr, the returned value will be negative. - * Otherwise (if the 1st nucleotide is CDS), the returned value is 1. - */ - public int computeInitialCodingCoord() { - if(!isProteinCodingTranscript()) { - throw new ReviewedStingException("This method should only be called for protein-coding transcripts"); - } - - if(positiveStrand) - { - if( cdsStart == exonStarts[0] ) { - //the 1st nucleotide of the transcript is CDS. - return 1; - } - - int result = 0; - for(int i = 0; i < exonStarts.length; i++) - { - final int exonStart = exonStarts[i]; - final int exonEnd = exonEnds[i]; - if(cdsStart <= exonEnd) { //eg. 
exonEnd is now on the 3' side of cdsStart - //this means cdsStart is within the current exon - result += (cdsStart - exonStart) + 1; - break; - } else { - //cdsStart is downstream of the current exon - result += (exonEnd - exonStart) + 1; - } - } - return -result; //negate because 5' UTR coding coord is negative - } - else //(negative strand) - { - final int cdsStart_5prime = cdsEnd; - if(cdsStart_5prime == exonEnds[exonEnds.length - 1]) { - //the 1st nucleotide of the transcript is CDS. - return 1; - } - - int result = 0; - for(int i = exonEnds.length - 1; i >= 0; i--) - { - final int exonStart = exonEnds[i]; //when its the negative strand, the 5' coord of the 1st exon is exonEnds[i] - final int exonEnd = exonStarts[i]; - if( exonEnd <= cdsStart_5prime ) { //eg. exonEnd is now on the 3' side of cdsStart - //this means cdsStart is within the current exon - result += -(cdsStart_5prime - exonStart) + 1; - break; - } else { - //cdsStart is downstream of the current exon - result += -(exonEnd - exonStart) + 1; - } - } - return -result; //negate because 5' UTR coding coord is negative - } - } - } - - -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java index 57bc44ab8..e982582ee 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java @@ -10,15 +10,12 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.List; import java.util.Map; -public interface GenotypeAnnotation { +public abstract class GenotypeAnnotation extends VariantAnnotatorAnnotation { // return annotations for the given contexts/genotype split by sample - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext 
stratifiedContext, VariantContext vc, Genotype g); - - // return the FORMAT keys - public List getKeyNames(); + public abstract Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g); // return the descriptions used for the VCF FORMAT meta field - public List getDescriptions(); - + public abstract List getDescriptions(); + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java index 4e850d01b..84438ccd8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java @@ -3,21 +3,18 @@ package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotator; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.List; import java.util.Map; -public interface InfoFieldAnnotation { - +public abstract class InfoFieldAnnotation extends VariantAnnotatorAnnotation { // return annotations for the given contexts split by sample - public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc); - - // return the INFO keys - public List getKeyNames(); + public abstract Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc); // return the 
descriptions used for the VCF INFO meta field - public List getDescriptions(); - + public abstract List getDescriptions(); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java new file mode 100644 index 000000000..f33d61df9 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/VariantAnnotatorAnnotation.java @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.List; +import java.util.Map; + +@DocumentedGATKFeature(enable = true, groupName = "VariantAnnotator annotations", summary = "VariantAnnotator annotations") +public abstract class VariantAnnotatorAnnotation { + // return the INFO keys + public abstract List getKeyNames(); +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index 21c8ec430..d0bc59fbd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -127,7 +127,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { return 0; if (vc_input.isFiltered()) { - vcfWriter.add(vc_input, ref.getBase()); + vcfWriter.add(vc_input); return 1; } List r2rods = tracker.getReferenceMetaData(R2_ROD_NAME); @@ -333,7 +333,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { } - vcfWriter.add(VariantContext.modifyAttributes(filteredVC,attributes), ref.getBase()); + vcfWriter.add(VariantContext.modifyAttributes(filteredVC,attributes)); return 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index 3eed12992..2fc0d2368 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -171,20 +171,20 @@ public class ProduceBeagleInputWalker extends RodWalker { logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1)); if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) { if ( bootstrapVCFOutput != null ) { - bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER), ref.getBase() ); + bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER)); } bootstrapSetSize++; return true; } else { if ( bootstrapVCFOutput != null ) { - bootstrapVCFOutput.add(validation,ref.getBase()); + bootstrapVCFOutput.add(validation); } testSetSize++; return false; } } else { if ( validation != null && bootstrapVCFOutput != null ) { - bootstrapVCFOutput.add(validation,ref.getBase()); + bootstrapVCFOutput.add(validation); } return false; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java index f6cd1d636..5d716bed4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/VariantsToBeagleUnphasedWalker.java @@ -110,7 +110,7 @@ public class VariantsToBeagleUnphasedWalker extends RodWalker // if we are holding it back and we are writing a bootstrap VCF, write it out if ( makeMissing && bootstrapVCFOutput != null ) { - bootstrapVCFOutput.add(vc, ref.getBase()); + bootstrapVCFOutput.add(vc); } // regardless, all sites are written to the unphased genotypes file, marked as missing if appropriate diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java index 15b16ca6b..a1c043365 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/BAMDiffableReader.java @@ -29,9 +29,7 @@ import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecordIterator; import net.sf.samtools.util.BlockCompressedInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; +import java.io.*; import java.util.Arrays; @@ -102,8 +100,10 @@ public class BAMDiffableReader implements DiffableReader { final byte[] BAM_MAGIC = "BAM\1".getBytes(); final byte[] buffer = new byte[BAM_MAGIC.length]; try { - FileInputStream fstream = new FileInputStream(file); - new BlockCompressedInputStream(fstream).read(buffer,0,BAM_MAGIC.length); + InputStream fstream = new BufferedInputStream(new FileInputStream(file)); + if ( !BlockCompressedInputStream.isValidFile(fstream) ) + return false; + new BlockCompressedInputStream(fstream).read(buffer, 0, BAM_MAGIC.length); return Arrays.equals(buffer, BAM_MAGIC); } catch ( IOException e ) { return false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java index e3910ef11..4e3342609 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffEngine.java @@ -42,6 +42,7 @@ import java.util.*; * Date: 7/4/11 * Time: 12:51 PM * A generic engine for comparing tree-structured objects + * */ public class DiffEngine { final protected static Logger logger = Logger.getLogger(DiffEngine.class); @@ -143,7 +144,7 @@ public class DiffEngine { * Not that only pairs of the 
same length are considered as potentially equivalent * * @param params determines how we display the items - * @param diffs + * @param diffs the list of differences to summarize */ public void reportSummarizedDifferences(List diffs, SummaryReportParams params ) { printSummaryReport(summarizeDifferences(diffs), params ); @@ -207,14 +208,7 @@ public class DiffEngine { } protected void printSummaryReport(List sortedSummaries, SummaryReportParams params ) { - GATKReport report = new GATKReport(); - final String tableName = "diffences"; - report.addTable(tableName, "Summarized differences between the master and test files.\nSee http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine for more information"); - GATKReportTable table = report.getTable(tableName); - table.addPrimaryKey("Difference", true); - table.addColumn("NumberOfOccurrences", 0); - table.addColumn("SpecificDifference", 0); - + List toShow = new ArrayList(); int count = 0, count1 = 0; for ( Difference diff : sortedSummaries ) { if ( diff.getCount() < params.minSumDiffToShow ) @@ -230,10 +224,26 @@ public class DiffEngine { break; } - table.set(diff.getPath(), "NumberOfOccurrences", diff.getCount()); - table.set(diff.getPath(), "SpecificDifference", diff.valueDiffString()); + toShow.add(diff); } + // if we want it in descending order, reverse the list + if ( ! params.descending ) { + Collections.reverse(toShow); + } + + // now that we have a specific list of values we want to show, display them + GATKReport report = new GATKReport(); + final String tableName = "diffences"; + report.addTable(tableName, "Summarized differences between the master and test files. 
See http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine for more information", false); + GATKReportTable table = report.getTable(tableName); + table.addPrimaryKey("Difference", true); + table.addColumn("NumberOfOccurrences", 0); + table.addColumn("ExampleDifference", 0); + for ( Difference diff : toShow ) { + table.set(diff.getPath(), "NumberOfOccurrences", diff.getCount()); + table.set(diff.getPath(), "ExampleDifference", diff.valueDiffString()); + } table.write(params.out); } @@ -252,7 +262,7 @@ public class DiffEngine { * commonPostfixLength: how many parts are shared at the end, suppose its 2 * We want to create a string *.*.C.D * - * @param parts + * @param parts the separated path values [above without .] * @param commonPostfixLength * @return */ @@ -332,12 +342,12 @@ public class DiffEngine { return reader.readFromFile(file, maxElementsToRead); } - public static boolean simpleDiffFiles(File masterFile, File testFile, DiffEngine.SummaryReportParams params) { + public static boolean simpleDiffFiles(File masterFile, File testFile, int maxElementsToRead, DiffEngine.SummaryReportParams params) { DiffEngine diffEngine = new DiffEngine(); if ( diffEngine.canRead(masterFile) && diffEngine.canRead(testFile) ) { - DiffElement master = diffEngine.createDiffableFromFile(masterFile); - DiffElement test = diffEngine.createDiffableFromFile(testFile); + DiffElement master = diffEngine.createDiffableFromFile(masterFile, maxElementsToRead); + DiffElement test = diffEngine.createDiffableFromFile(testFile, maxElementsToRead); List diffs = diffEngine.diff(master, test); diffEngine.reportSummarizedDifferences(diffs, params); return true; @@ -351,6 +361,7 @@ public class DiffEngine { int maxItemsToDisplay = 0; int maxCountOneItems = 0; int minSumDiffToShow = 0; + boolean descending = true; public SummaryReportParams(PrintStream out, int maxItemsToDisplay, int maxCountOneItems, int minSumDiffToShow) { this.out = out; @@ -358,5 +369,9 @@ public class DiffEngine { 
this.maxCountOneItems = maxCountOneItems; this.minSumDiffToShow = minSumDiffToShow; } + + public void setDescending(boolean descending) { + this.descending = descending; + } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java index 8e362dcc4..f43d1342d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java @@ -37,42 +37,171 @@ import java.io.PrintStream; import java.util.List; /** - * Compares two record-oriented files, itemizing specific difference between equivalent - * records in the two files. Reports both itemized and summarized differences. + * A generic engine for comparing tree-structured objects + * + *

+ * Compares two record-oriented files, itemizing specific difference between equivalent + * records in the two files. Reports both itemized and summarized differences. + *

+ * + *

What are the summarized differences and the DiffObjectsWalker?

+ * + *

+ * The GATK contains a summarizing difference engine that compares hierarchical data structures to emit: + *

    + *
  • A list of specific differences between the two data structures. This is similar to saying the value in field A in record 1 in file F differs from the value in field A in record 1 in file G. + *
  • A summarized list of differences ordered by frequency of the difference. This output is similar to saying field A in 50 records in files F and G differed. + *
+ *

+ * + *

+ * The GATK contains a private walker DiffObjects that allows you access to the DiffEngine capabilities on the command line. Simply provide the walker with the master and test files and it will emit summarized differences for you. + *

+ * + *

Why?

+ * + *

+ * The reason for this system is that it allows you to compare two structured files -- such as BAMs and VCFs -- for common differences among them. This is primarily useful in regression testing or optimization, where you want to ensure that the differences are those that you expect and not any others. + *

+ * + *

Input

+ *

+ * The DiffObjectsWalker works with BAM or VCF files. + *

+ * + *

Output

+ *

+ * The DiffEngine system compares two hierarchical data structures for specific differences in the values of named + * nodes. Suppose I have two trees: + *

+ *     Tree1=(A=1 B=(C=2 D=3))
+ *     Tree2=(A=1 B=(C=3 D=3 E=4))
+ *     Tree3=(A=1 B=(C=4 D=3 E=4))
+ * 
+ *

+ * where every node in the tree is named, or is a raw value (here all leaf values are integers). The DiffEngine + * traverses these data structures by name, identifies equivalent nodes by fully qualified names + * (Tree1.A is distinct from Tree2.A), and determines whether their values are equal (Tree1.A=1, Tree2.A=1, so they are). + * These itemized differences are listed as: + *

+ *     Tree1.B.C=2 != Tree2.B.C=3
+ *     Tree1.B.C=2 != Tree3.B.C=4
+ *     Tree2.B.C=3 != Tree3.B.C=4
+ *     Tree1.B.E=MISSING != Tree2.B.E=4
+ * 
+ * + *

+ * This is conceptually very similar to the output of the unix command line tool diff. What's nice about DiffEngine though + * is that it computes similarity among the itemized differences and displays the count of difference names + * in the system. In the above example, the field C is not equal three times, while the missing E in Tree1 occurs + * only once. So the summary is: + * + *

+ *     *.B.C : 3
+ *     *.B.E : 1
+ * 
+ * + *

+ * where the * operator indicates that any named field matches. This output is sorted by counts, and provides an + * immediate picture of the commonly occurring differences among the files. + *

+ * Below is a detailed example of two VCF fields that differ because of a bug in the AC, AF, and AN counting routines, + * detected by the integrationtest integration (more below). You can see that although there are many specific + * instances of these differences between the two files, the summarized differences provide an immediate picture that + * the AC, AF, and AN fields are the major causes of the differences. + *

+ * + *

+   [testng] path                                                             count
+   [testng] *.*.*.AC                                                         6
+   [testng] *.*.*.AF                                                         6
+   [testng] *.*.*.AN                                                         6
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AC  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AF  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AN  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AC  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AF  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AN  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AC  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AF  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AN  1
+   [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000598.AC  1
+
+ * * @author Mark DePristo - * @version 0.1 + * @since 7/4/11 */ @Requires(value={}) public class DiffObjectsWalker extends RodWalker { + /** + * Writes out a file of the DiffEngine format: + * + * http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine + */ @Output(doc="File to which results should be written",required=true) protected PrintStream out; - @Argument(fullName="maxObjectsToRead", shortName="motr", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false) - int MAX_OBJECTS_TO_READ = -1; - - @Argument(fullName="maxDiffs", shortName="M", doc="Max. number of diffs to process", required=false) - int MAX_DIFFS = 0; - - @Argument(fullName="maxCount1Diffs", shortName="M1", doc="Max. number of diffs occuring exactly once in the file to process", required=false) - int MAX_COUNT1_DIFFS = 0; - - @Argument(fullName="minCountForDiff", shortName="MCFD", doc="Min number of observations for a records to display", required=false) - int minCountForDiff = 1; - - @Argument(fullName="showItemizedDifferences", shortName="SID", doc="Should we enumerate all differences between the files?", required=false) - boolean showItemizedDifferences = false; - + /** + * The master file against which we will compare test. This is one of the two required + * files to do the comparison. Conceptually master is the original file contained the expected + * results, but this doesn't currently have an impact on the calculations, but might in the future. + */ @Argument(fullName="master", shortName="m", doc="Master file: expected results", required=true) File masterFile; + /** + * The test file against which we will compare to the master. This is one of the two required + * files to do the comparison. Conceptually test is the derived file from master, but this + * doesn't currently have an impact on the calculations, but might in the future. 
+ */ @Argument(fullName="test", shortName="t", doc="Test file: new results to compare to the master file", required=true) File testFile; - final DiffEngine diffEngine = new DiffEngine(); + /** + * The engine will read at most this number of objects from each of master and test files. This reduces + * the memory requirements for DiffObjects but does limit you to comparing at most this number of objects + */ + @Argument(fullName="maxObjectsToRead", shortName="motr", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false) + int MAX_OBJECTS_TO_READ = -1; + + /** + * The max number of differences to display when summarizing. For example, if there are 10M differences, but + * maxDiffs is 10, then the comparison aborts after first ten summarized differences are shown. Note that + * the system shows differences sorted by frequency, so these 10 would be the most common between the two files. + * A value of 0 means show all possible differences. + */ + @Argument(fullName="maxDiffs", shortName="M", doc="Max. number of diffs to process", required=false) + int MAX_DIFFS = 0; + + /** + * The maximum number of singleton (occurs exactly once between the two files) to display when writing out + * the summary. Only applies if maxDiffs hasn't been exceeded. For example, if maxDiffs is 10 and maxCount1Diffs + * is 2 and there are 20 diffs with count > 1, then only 10 are shown, all of which have count above 1. + */ + @Argument(fullName="maxCount1Diffs", shortName="M1", doc="Max. number of diffs occuring exactly once in the file to process", required=false) + int MAX_COUNT1_DIFFS = 0; + + /** + * Only differences that occur more than minCountForDiff are displayed. For example, if minCountForDiff is 10, then + * a difference must occur at least 10 times between the two files to be shown. 
+ */ + @Argument(fullName="minCountForDiff", shortName="MCFD", doc="Min number of observations for a records to display", required=false) + int minCountForDiff = 1; + + /** + * If provided, the system will write out the summarized, individual differences. May lead to enormous outputs, + * depending on how many differences are found. Note these are not sorted in any way, so if you have 10M + * common differences in the files, you will see 10M records, whereas the final summarize will just list the + * difference and its count of 10M. + */ + @Argument(fullName="showItemizedDifferences", shortName="SID", doc="Should we enumerate all differences between the files?", required=false) + boolean showItemizedDifferences = false; + + DiffEngine diffEngine; @Override public void initialize() { - + this.diffEngine = new DiffEngine(); } @Override @@ -112,6 +241,7 @@ public class DiffObjectsWalker extends RodWalker { } DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff); + params.setDescending(false); diffEngine.reportSummarizedDifferences(diffs, params); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java index df2a5cda1..77a992ce0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/VCFDiffableReader.java @@ -72,13 +72,19 @@ public class VCFDiffableReader implements DiffableReader { } String line = lineReader.readLine(); - int count = 0; + int count = 0, nRecordsAtPos = 1; + String prevName = ""; while ( line != null ) { if ( count++ > maxElementsToRead && maxElementsToRead != -1) break; VariantContext vc = (VariantContext)vcfCodec.decode(line); String name = vc.getChr() + ":" + vc.getStart(); + if ( 
name.equals(prevName) ) { + name += "_" + ++nRecordsAtPos; + } else { + prevName = name; + } DiffNode vcRoot = DiffNode.empty(name, root); // add fields diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index 6c023573a..2507eabbb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -272,7 +272,7 @@ public class VariantFiltrationWalker extends RodWalker { else filteredVC = new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), filters, vc.getAttributes()); - writer.add( filteredVC, context.getReferenceContext().getBase() ); + writer.add(filteredVC); } public Integer reduce(Integer value, Integer sum) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index be2039780..60ea601d5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -391,7 +391,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood if (DEBUG) System.out.format("hsize: %d eventLength: %d refSize: %d, locStart: %d numpr: %d\n",hsize,eventLength, (int)ref.getWindow().size(), loc.getStart(), numPrefBases); - + //System.out.println(eventLength); haplotypeMap = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(), ref, hsize, numPrefBases); @@ -418,8 +418,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends 
GenotypeLikelihood - // which genotype likelihoods correspond to two most likely alleles? By convention, likelihood vector is lexically ordered, for example - // for 3 alleles it's 00 01 02 11 12 22 + // which genotype likelihoods correspond to two most likely alleles? By convention, likelihood vector is ordered as for example + // for 3 alleles it's 00 01 11 02 12 22 GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(), alleleList, genotypeLikelihoods, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 3e3cd128b..9d917078d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -68,7 +68,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC if ( vc == null ) { vc = vc_input; } else { - logger.warn("Multiple valid VCF records detected at site " + ref.getLocus() + ", only considering alleles from first record only"); + logger.warn("Multiple valid VCF records detected at site " + ref.getLocus() + ", only considering alleles from first record"); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java index 22c3081a3..e5e78905f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java @@ -93,7 +93,7 @@ public class UGCalcLikelihoods extends LocusWalker public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { VariantContext 
call = UG_engine.calculateLikelihoods(tracker, refContext, rawContext); - return call == null ? null : new VariantCallContext(call, refContext.getBase(), true); + return call == null ? null : new VariantCallContext(call, true); } public Integer reduceInit() { return 0; } @@ -107,7 +107,7 @@ public class UGCalcLikelihoods extends LocusWalker return sum; try { - writer.add(value, value.refBase); + writer.add(value); } catch (IllegalArgumentException e) { throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java index 68d8f9b54..fd29ff87e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java @@ -62,7 +62,6 @@ public class UGCallVariants extends RodWalker { private Set trackNames = new HashSet(); public void initialize() { - UAC.NO_SLOD = true; for ( ReferenceOrderedDataSource d : getToolkit().getRodDataSources() ) { if ( d.getName().startsWith("variant") ) @@ -116,7 +115,7 @@ public class UGCallVariants extends RodWalker { try { Map attrs = new HashMap(value.getAttributes()); VariantContextUtils.calculateChromosomeCounts(value, attrs, true); - writer.add(VariantContext.modifyAttributes(value, attrs), value.refBase); + writer.add(VariantContext.modifyAttributes(value, attrs)); } catch (IllegalArgumentException e) { throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 2b25df4aa..52bf3f715 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -58,8 +58,8 @@ public class UnifiedArgumentCollection { @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; - @Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false) - public boolean NO_SLOD = false; + @Argument(fullName = "computeSLOD", shortName = "sl", doc = "If provided, we will calculate the SLOD", required = false) + public boolean COMPUTE_SLOD = false; // control the error modes @@ -154,7 +154,7 @@ public class UnifiedArgumentCollection { uac.PCR_error = PCR_error; uac.GenotypingMode = GenotypingMode; uac.OutputMode = OutputMode; - uac.NO_SLOD = NO_SLOD; + uac.COMPUTE_SLOD = COMPUTE_SLOD; uac.ASSUME_SINGLE_SAMPLE = ASSUME_SINGLE_SAMPLE; uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 2a0338bca..d379b05a1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -144,7 +144,7 @@ public class UnifiedGenotyper extends LocusWalker GLs) { @@ -300,7 +300,8 @@ public 
class UnifiedGenotyperEngine { genotypes, VariantContext.NO_NEG_LOG_10PERROR, null, - null); + null, + refContext.getBase()); } // private method called by both UnifiedGenotyper and UGCallVariants entry points into the engine @@ -372,8 +373,8 @@ public class UnifiedGenotyperEngine { attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); - if ( !UAC.NO_SLOD && bestAFguess != 0 ) { - final boolean DEBUG_SLOD = false; + if ( UAC.COMPUTE_SLOD && bestAFguess != 0 ) { + //final boolean DEBUG_SLOD = false; // the overall lod VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model); @@ -381,7 +382,7 @@ public class UnifiedGenotyperEngine { afcm.get().getLog10PNonRef(tracker, refContext, vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); //double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); - if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); + //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model); @@ -390,7 +391,7 @@ public class UnifiedGenotyperEngine { //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); - if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); + //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", 
forwardLog10PofF=" + forwardLog10PofF); // the reverse lod VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model); @@ -399,11 +400,11 @@ public class UnifiedGenotyperEngine { //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); - if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); + //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF; - if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); + //if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); // strand score is max bias between forward and reverse strands double strandScore = Math.max(forwardLod, reverseLod); @@ -425,10 +426,10 @@ public class UnifiedGenotyperEngine { myAlleles.add(vc.getReference()); } VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc, - myAlleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes); + myAlleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? 
null : filter, attributes, refContext.getBase()); if ( annotationEngine != null ) { - // first off, we want to use the *unfiltered* and *unBAQed* context for the annotations + // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations ReadBackedPileup pileup = null; if (rawContext.hasExtendedEventPileup()) pileup = rawContext.getExtendedEventPileup(); @@ -436,13 +437,10 @@ public class UnifiedGenotyperEngine { pileup = rawContext.getBasePileup(); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); - Collection variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall); - vcCall = variantContexts.iterator().next(); // we know the collection will always have exactly 1 element. + vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall); } - VariantCallContext call = new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); - call.setRefBase(refContext.getBase()); - return call; + return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); } private int calculateEndPos(Set alleles, Allele refAllele, GenomeLoc loc) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java index 5896e784e..423c80112 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java @@ -36,7 +36,6 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; * Useful helper class to communicate the results of calculateGenotype to framework */ public class VariantCallContext extends VariantContext { - public byte refBase; // Was the site called confidently, either reference or variant? 
public boolean confidentlyCalled = false; @@ -55,16 +54,6 @@ public class VariantCallContext extends VariantContext { this.shouldEmit = shouldEmit; } - VariantCallContext(VariantContext vc, byte ref, boolean confidentlyCalledP) { - super(vc); - this.refBase = ref; - this.confidentlyCalled = confidentlyCalledP; - } - - public void setRefBase(byte ref) { - this.refBase = ref; - } - /* these methods are only implemented for GENOTYPE_GIVEN_ALLELES MODE */ //todo -- expand these methods to all modes diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 60262d6f4..55450486b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -1042,8 +1042,8 @@ public class PairHMMIndelErrorModel { int k=0; double maxElement = Double.NEGATIVE_INFINITY; - for (int i=0; i < hSize; i++) { - for (int j=i; j < hSize; j++){ + for (int j=0; j < hSize; j++) { + for (int i=0; i <= j; i++){ genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j]; if (haplotypeLikehoodMatrix[i][j] > maxElement) maxElement = haplotypeLikehoodMatrix[i][j]; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index 488e37f26..6453ce8de 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -64,7 +64,7 @@ public class RealignerTargetCreator extends RodWalker { filters.add("NoCall"); } VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes, - -1.0 /* log error */, filters, null); - 
vcf.add(vc,refBases[(int)start-1]); + -1.0 /* log error */, filters, null, refBases[(int)start-1]); + vcf.add(vc); } /** Fills l with appropriate alleles depending on whether call is insertion or deletion @@ -1130,8 +1130,8 @@ public class SomaticIndelDetectorWalker extends ReadWalker { } VariantContext vc = new VariantContext("IGv2_Indel_call", refName, start, stop, alleles, genotypes, - -1.0 /* log error */, filters, attrs); - vcf.add(vc,refBases[(int)start-1]); + -1.0 /* log error */, filters, attrs, refBases[(int)start-1]); + vcf.add(vc); } @Override diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java deleted file mode 100755 index 9aa370d3f..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java +++ /dev/null @@ -1,890 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.phasing; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.AminoAcid; -import org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator.AminoAcidTable; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - -import java.util.*; - -import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods; - - -/** - * Walks along all variant ROD loci, and dynamically annotates alleles at MNP records. 
- */ -@Allows(value = {DataSource.REFERENCE}) -@Requires(value = {DataSource.REFERENCE}, referenceMetaData = {@RMD(name = AnnotateMNPsWalker.REFSEQ_ROD_NAME, type = AnnotatorInputTableFeature.class), @RMD(name = AnnotateMNPsWalker.VARIANT_ROD_NAME, type = ReferenceOrderedDatum.class)}) - -public class AnnotateMNPsWalker extends RodWalker { - - @Output(doc = "File to which variants should be written", required = true) - protected VCFWriter writer = null; - private ManualSortingVCFWriter sortingWriter = null; - - @Argument(fullName = "emitOnlyMNPs", shortName = "emitOnlyMNPs", doc = "Only output MNP records; [default:false]", required = false) - protected boolean emitOnlyMNPs = false; - - private LinkedList rodNames = null; - private GenomeLocParser locParser = null; - private TreeMap> MNPstartToStops = null; // Must be TreeMap sorted by START sites! - - public final static String REFSEQ_ROD_NAME = "refseq"; - public final static String VARIANT_ROD_NAME = "variant"; - - private LocusToFeatures locusToRefSeqFeatures = null; - - - protected final static String MNP_ANNOTATION_KEY_PREFIX = "MNP.refseq."; - - protected final static String REFSEQ_NAME = "name"; - protected final static String REFSEQ_NAME2 = "name2"; - - protected final static String REFSEQ_POSITION_TYPE = "positionType"; - protected final static String REFSEQ_CDS = "CDS"; - - protected final static String REFSEQ_STRAND = "transcriptStrand"; - protected final static String REFSEQ_POS_STRAND = "+"; - protected final static String REFSEQ_NEG_STRAND = "-"; - - protected final static String REFSEQ_CODON_COORD = "codonCoord"; - protected final static String REFSEQ_CODING_FRAME = "frame"; - - protected final static String REFSEQ_REF_CODON = "referenceCodon"; - protected final static String REFSEQ_REF_AA = "referenceAA"; - - protected final static String REFSEQ_ALT_BASE = "haplotypeAlternate"; - - protected final static String REFSEQ_VARIANT_CODON = "variantCodon"; - protected final static String REFSEQ_VARIANT_AA 
= "variantAA"; - protected final static String REFSEQ_CHANGES_AA = "changesAA"; - protected final static String REFSEQ_FUNCTIONAL_CLASS = "functionalClass"; - protected final static String REFSEQ_PROTEIN_COORD_DESCRIPTION = "proteinCoordStr"; - - protected final static String REFSEQ_CODING_ANNOTATIONS = "codingVariants"; - protected final static String REFSEQ_NUM_AA_CHANGES = "numAAchanges"; - protected final static String REFSEQ_HAS_MULT_AA_CHANGES = "alleleHasMultAAchanges"; - - public void initialize() { - rodNames = new LinkedList(); - rodNames.add(VARIANT_ROD_NAME); - - locParser = getToolkit().getGenomeLocParser(); - MNPstartToStops = new TreeMap>(); // sorted by start sites - - initializeVcfWriter(); - - locusToRefSeqFeatures = new LocusToFeatures(); - } - - private void initializeVcfWriter() { - sortingWriter = new ManualSortingVCFWriter(writer); - writer = sortingWriter; - - // setup the header fields: - Set hInfo = new HashSet(); - hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); - hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); - - Map rodNameToHeader = getVCFHeadersFromRods(getToolkit(), rodNames); - writer.writeHeader(new VCFHeader(hInfo, new TreeSet(rodNameToHeader.get(rodNames.get(0)).getGenotypeSamples()))); - } - - public boolean generateExtendedEvents() { - return false; - } - - public Integer reduceInit() { - return 0; - } - - /** - * For each site of interest, annotate it if it's a MNP. 
- * - * @param tracker the meta-data tracker - * @param ref the reference base - * @param context the context for the given locus - * @return count of MNPs observed - */ - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (tracker == null) - return null; - - int numMNPsObserved = 0; - GenomeLoc curLocus = ref.getLocus(); - clearOldLocusFeatures(curLocus); - - boolean requireStartHere = false; // see EVERY site of the MNP - boolean takeFirstOnly = false; // take as many entries as the VCF file has - for (VariantContext vc : tracker.getVariantContexts(ref, rodNames, null, context.getLocation(), requireStartHere, takeFirstOnly)) { - GenomeLoc vcLoc = VariantContextUtils.getLocation(locParser, vc); - boolean atStartOfVc = curLocus.getStart() == vcLoc.getStart(); - boolean atEndOfVc = curLocus.getStart() == vcLoc.getStop(); - - if (vc.isMNP()) { - logger.debug("Observed MNP at " + vcLoc); - - if (isChrM(vc)) { - if (atStartOfVc) { - logger.warn("Skipping mitochondrial MNP at " + vcLoc + " due to complexity of coding table [need to know if first codon, etc.]..."); - writeVCF(vc); - } - continue; - } - - GenomeLoc stopLoc = locParser.createGenomeLoc(curLocus.getContig(), vcLoc.getStop()); - final List refSeqRODs = tracker.getReferenceMetaData(REFSEQ_ROD_NAME); - for (Object refSeqObject : refSeqRODs) { - AnnotatorInputTableFeature refSeqAnnotation = (AnnotatorInputTableFeature) refSeqObject; - locusToRefSeqFeatures.putLocusFeatures(curLocus, refSeqAnnotation, stopLoc); - } - - if (atStartOfVc) { // MNP is starting here, so register that we're waiting for it - Set stopLocs = MNPstartToStops.get(curLocus); - if (stopLocs == null) { - stopLocs = new HashSet(); - MNPstartToStops.put(curLocus, stopLocs); - } - stopLocs.add(stopLoc); - } - - if (atEndOfVc) { - numMNPsObserved++; // only count a MNP at its stop site - logger.debug("Observed end of MNP at " + curLocus); - logger.debug("Current list of per-locus features\n" + 
locusToRefSeqFeatures); - - Map MNPannotations = annotateMNP(vc); - MNPannotations.putAll(RefSeqDataParser.removeRefSeqAttributes(vc.getAttributes())); // remove any RefSeq INFO, since adding it in more thoroughly here - vc = VariantContext.modifyAttributes(vc, MNPannotations); - writeVCF(vc); - - GenomeLoc startLoc = locParser.createGenomeLoc(curLocus.getContig(), vcLoc.getStart()); - Set stopLocs = MNPstartToStops.get(startLoc); - if (stopLocs != null) { // otherwise, just removed stopLocs due to another MNP that has the same (start, stop) - stopLocs.remove(stopLoc); - if (stopLocs.isEmpty()) // no longer waiting for startLoc - MNPstartToStops.remove(startLoc); - } - } - } - else if (atStartOfVc && !emitOnlyMNPs) {// only want to write other VariantContexts records once (where they start): - writeVCF(vc); - } - } - - Integer mostUpstreamWritableLoc = null; - if (!MNPstartToStops.isEmpty()) { - GenomeLoc waitingForLoc = MNPstartToStops.entrySet().iterator().next().getKey(); - mostUpstreamWritableLoc = waitingForLoc.getStart() - 1; - } - sortingWriter.setmostUpstreamWritableLocus(mostUpstreamWritableLoc); - - return numMNPsObserved; - } - - private static boolean isChrM(final VariantContext vc) { - return vc.getChr().equals("chrM") || vc.getChr().equals("MT"); - } - - private Map annotateMNP(VariantContext vc) { - Map annotations = new HashMap(); - - RefSeqNameToFeatures nameToPositionalFeatures = new RefSeqNameToFeatures(vc); - MNPannotationKeyBuilder kb = new MNPannotationKeyBuilder(nameToPositionalFeatures); - - for (Map.Entry nameToFeatureEntry : nameToPositionalFeatures.entrySet()) { - String featureName = nameToFeatureEntry.getKey(); - RefSeqFeatureList feature = nameToFeatureEntry.getValue(); - CodonAnnotationsForAltAlleles codonAnnotationsForAlleles = new CodonAnnotationsForAltAlleles(vc, feature); - - annotations.put(kb.getKey(REFSEQ_CODING_ANNOTATIONS), codonAnnotationsForAlleles.getCodonAnnotationsString()); - 
annotations.put(kb.getKey(REFSEQ_NUM_AA_CHANGES), codonAnnotationsForAlleles.getNumAAchangesString()); - annotations.put(kb.getKey(REFSEQ_HAS_MULT_AA_CHANGES), codonAnnotationsForAlleles.hasAlleleWithMultipleAAchanges); - annotations.put(kb.getKey(REFSEQ_NAME), featureName); - annotations.put(kb.getKey(REFSEQ_NAME2), feature.name2); - annotations.put(kb.getKey(REFSEQ_POSITION_TYPE), REFSEQ_CDS); - annotations.put(kb.getKey(REFSEQ_STRAND), (feature.positiveStrand ? REFSEQ_POS_STRAND : REFSEQ_NEG_STRAND)); - annotations.put(kb.getKey(REFSEQ_CODON_COORD), feature.getCodonCoordString()); - - kb.incrementFeatureIndex(); - } - - return annotations; - } - - private static class MNPannotationKeyBuilder { - private int featureIndex; - private boolean multipleEntries; - - public MNPannotationKeyBuilder(RefSeqNameToFeatures nameToPositionalFeatures) { - this.featureIndex = 1; - this.multipleEntries = nameToPositionalFeatures.nameToFeatures.size() > 1; - } - - public void incrementFeatureIndex() { - featureIndex++; - } - - public String getKey(String type) { - String annotationKey = MNP_ANNOTATION_KEY_PREFIX + type; - if (multipleEntries) - annotationKey += "_" + featureIndex; - return annotationKey; - } - } - - private static byte[] ByteArrayToPrimitive(Byte[] nonNullArray) { - byte[] primArray = new byte[nonNullArray.length]; - - for (int i = 0; i < nonNullArray.length; i++) { - if (nonNullArray[i] == null) - throw new ReviewedStingException("nonNullArray[i] == null"); - primArray[i] = nonNullArray[i]; - } - - return primArray; - } - - private void clearOldLocusFeatures(GenomeLoc curLoc) { - Iterator> locusFeaturesIt = locusToRefSeqFeatures.entrySet().iterator(); - while (locusFeaturesIt.hasNext()) { - Map.Entry locusFeaturesEntry = locusFeaturesIt.next(); - if (curLoc.isPast(locusFeaturesEntry.getValue().getFurthestLocusUsingFeatures())) - locusFeaturesIt.remove(); - } - } - - public Integer reduce(Integer count, Integer total) { - if (count != null) - total = total + 
count; - - return total; - } - - /** - * @param result the number of MNPs processed. - */ - public void onTraversalDone(Integer result) { - System.out.println("Number of MNPs observed: " + result); - writer.close(); - } - - private void writeVCF(VariantContext vc) { - WriteVCF.writeVCF(vc, writer, logger); - } - - /* - Inner classes: - */ - - // Maps: RefSeq entry name -> features for ALL positions of a particular VariantContext MNP: - - private class RefSeqNameToFeatures { - private Map nameToFeatures; - - public RefSeqNameToFeatures(VariantContext vc) { - this.nameToFeatures = new HashMap(); - - int MNPstart = vc.getStart(); - int MNPstop = vc.getEnd(); - int MNPlength = MNPstop - MNPstart + 1; - - for (int i = 0; i < MNPlength; i++) { - int genomicPosition = MNPstart + i; - GenomeLoc posLoc = locParser.createGenomeLoc(vc.getChr(), genomicPosition); - - PositionalRefSeqFeatures locFeatures = locusToRefSeqFeatures.getLocusFeatures(posLoc); - if (locFeatures == null) // no features for posLoc - continue; - - for (Map.Entry nameToFeatureEntry : locFeatures.entrySet()) { - String name = nameToFeatureEntry.getKey(); - PositionalRefSeqFeature posFeature = nameToFeatureEntry.getValue(); - - RefSeqFeatureList featureList = nameToFeatures.get(name); - if (featureList == null) { - featureList = new RefSeqFeatureList(MNPlength); - nameToFeatures.put(name, featureList); - } - featureList.updateFeatureAtPosition(i, posFeature); - } - } - } - - public Set> entrySet() { - return nameToFeatures.entrySet(); - } - } - - // For a particular RefSeq entry, contains the features for ALL positions of a particular VariantContext MNP - - private static class RefSeqFeatureList { - private final static String CODON_FRAME_START = "("; - private final static String CODON_FRAME_END = ")"; - private final static String CODON_DELIM = "|"; - - private CodingRefSeqFeature[] refSeqFeatures; - private String name2; - private Boolean positiveStrand; - - private Map> codonToIndices; // Map of: codon 
index -> MNP indices that refer to codon - - public RefSeqFeatureList(int MNPlength) { - this.refSeqFeatures = new CodingRefSeqFeature[MNPlength]; - for (int i = 0; i < MNPlength; i++) - this.refSeqFeatures[i] = null; - - this.name2 = null; - this.positiveStrand = null; - this.codonToIndices = new TreeMap>(); - } - - public void updateFeatureAtPosition(int index, PositionalRefSeqFeature feature) { - if (name2 == null) { - name2 = feature.name2; - positiveStrand = feature.positiveStrand; - } - else if (!name2.equals(feature.name2) || positiveStrand != feature.positiveStrand) { - throw new UserException("Inconsistency between previous RefSeq entry and: " + feature); - } - - CodingRefSeqFeature crsf = new CodingRefSeqFeature(feature); - refSeqFeatures[index] = crsf; - - List indicesWithCodon = codonToIndices.get(crsf.codonCoord); - if (indicesWithCodon == null) { - indicesWithCodon = new LinkedList(); - codonToIndices.put(crsf.codonCoord, indicesWithCodon); - } - indicesWithCodon.add(index); - } - - public Set>> codonIndicesEntrySet() { - return codonToIndices.entrySet(); - } - - public String getCodonCoordString() { - StringBuilder sb = new StringBuilder(); - - for (int i = 0; i < refSeqFeatures.length; i++) { - CodingRefSeqFeature crsf = refSeqFeatures[i]; - if (crsf != null) - sb.append(crsf.codonCoord).append(CODON_FRAME_START).append(crsf.codingFrame).append(CODON_FRAME_END); - if (i < refSeqFeatures.length - 1) - sb.append(CODON_DELIM); - } - - return sb.toString(); - } - } - - private static class CodingRefSeqFeature { - protected int codonCoord; - protected int codingFrame; - protected String referenceCodon; - protected String referenceAA; - - public CodingRefSeqFeature(PositionalRefSeqFeature feature) { - this.codonCoord = feature.codonCoord; - this.codingFrame = feature.codingFrame; - this.referenceCodon = feature.referenceCodon.toUpperCase(); - this.referenceAA = feature.referenceAA; - } - } - - private static class CodonAnnotationsForAltAlleles { - 
protected final static int MIN_CODON_INDEX = 0; - protected final static int NUM_CODON_INDICES = 3; - private final static String CODON_ANNOTATION_DELIM = ","; - - private List alleleAnnotations; - private int[] alleleToNumAAchanges; - private boolean hasAlleleWithMultipleAAchanges; - - public CodonAnnotationsForAltAlleles(VariantContext vc, RefSeqFeatureList feature) { - this.alleleAnnotations = new LinkedList(); - - Set altAlleles = vc.getAlternateAlleles(); - int numAltAlleles = altAlleles.size(); - this.alleleToNumAAchanges = new int[numAltAlleles]; - for (int i = 0; i < numAltAlleles; i++) - this.alleleToNumAAchanges[i] = 0; - - int MNPstart = vc.getStart(); - int MNPstop = vc.getEnd(); - int MNPlength = MNPstop - MNPstart + 1; - - for (Map.Entry> codonToIndicesEntry : feature.codonIndicesEntrySet()) { - int codonIndex = codonToIndicesEntry.getKey(); - List indices = codonToIndicesEntry.getValue(); - if (indices.isEmpty()) - throw new ReviewedStingException("indices should not exist if it's empty!"); - - for (int index : indices) { - int frame = feature.refSeqFeatures[index].codingFrame; - if (feature.refSeqFeatures[index].codonCoord != codonIndex) - throw new ReviewedStingException("LOGICAL ERROR: feature.refSeqFeatures[index].codonCoord != codonIndex"); - if (frame < MIN_CODON_INDEX || frame >= NUM_CODON_INDICES) - throw new UserException("RefSeq codon frame not one of {0,1,2}"); - } - CodingRefSeqFeature firstFeatureForCodon = feature.refSeqFeatures[indices.get(0)]; - String refCodon = firstFeatureForCodon.referenceCodon; - - SingleCodonAnnotationsForAlleles codonAnnotation = new SingleCodonAnnotationsForAlleles(codonIndex, altAlleles, MNPlength, refCodon, firstFeatureForCodon, indices, feature); - alleleAnnotations.add(codonAnnotation); - - // From a single codon, summarize the data for ALL alleles: - for (int i = 0; i < numAltAlleles; i++) { - if (codonAnnotation.annotationsForAlleles[i].codonFunc.changesAA) { - alleleToNumAAchanges[i]++; - if 
(alleleToNumAAchanges[i] > 1) - this.hasAlleleWithMultipleAAchanges = true; - } - } - } - } - - public String getCodonAnnotationsString() { - StringBuilder sb = new StringBuilder(); - - int index = 0; - for (SingleCodonAnnotationsForAlleles codonToAlleles : alleleAnnotations) { - sb.append(codonToAlleles); - if (index < alleleAnnotations.size() - 1) - sb.append(CODON_ANNOTATION_DELIM); - index++; - } - - return sb.toString(); - } - - public String getNumAAchangesString() { - StringBuilder sb = new StringBuilder(); - - for (int index = 0; index < alleleToNumAAchanges.length; index++) { - sb.append(alleleToNumAAchanges[index]); - if (index < alleleToNumAAchanges.length - 1) - sb.append(SingleCodonAnnotationsForAlleles.ALLELE_ANNOTATION_DELIM); - } - - return sb.toString(); - } - } - - private static class SingleCodonAnnotationsForAlleles { - private final static String CODON_MAP_SYMBOL = "->"; - private final static String CODON_ANNOTATION_START = "["; - private final static String CODON_ANNOTATION_END = "]"; - private final static String REF_CODON_INFO_DELIM = "|"; - private final static String ALLELE_ANNOTATION_DELIM = ","; - private final static String ASSIGNMENT = ":"; - - private int codonIndex; - private String refCodon; - private String refAA; - - private SingleCodonAnnotationsForAllele[] annotationsForAlleles; - - public SingleCodonAnnotationsForAlleles(int codonIndex, Collection altAlleles, int MNPlength, String refCodon, CodingRefSeqFeature firstFeatureForCodon, List indices, RefSeqFeatureList feature) { - if (refCodon.length() != CodonAnnotationsForAltAlleles.NUM_CODON_INDICES) - throw new UserException("RefSeq reference codon " + refCodon + " is not of length " + CodonAnnotationsForAltAlleles.NUM_CODON_INDICES); - - AminoAcid refAA = AminoAcidTable.getEukaryoticAA(refCodon); - if (!refAA.getCode().equals(firstFeatureForCodon.referenceAA)) - throw new UserException("RefSeq: translated reference codon= " + refAA + " != " + firstFeatureForCodon.referenceAA + 
" = reference AA"); - - this.codonIndex = codonIndex; - this.refCodon = refCodon; - this.refAA = refAA.getCode(); - this.annotationsForAlleles = new SingleCodonAnnotationsForAllele[altAlleles.size()]; - - int altInd = 0; - for (Allele altAllele : altAlleles) { - if (altAllele.length() != MNPlength) - throw new ReviewedStingException("length(altAllele) != length(MNP)"); - byte[] altBases = altAllele.getBases(); - - Byte[] variantCodonArr = new Byte[CodonAnnotationsForAltAlleles.NUM_CODON_INDICES]; - for (int i = CodonAnnotationsForAltAlleles.MIN_CODON_INDEX; i < CodonAnnotationsForAltAlleles.NUM_CODON_INDICES; i++) - variantCodonArr[i] = null; - - for (int index : indices) { - int frame = feature.refSeqFeatures[index].codingFrame; - if (variantCodonArr[frame] != null) - throw new UserException("RefSeq assigns codon " + codonIndex + " twice at same frame: " + frame); - - byte base = altBases[index]; - if (!feature.positiveStrand) // negative strand codon - base = BaseUtils.simpleComplement(base); - - variantCodonArr[frame] = base; - } - - /* For missing frames, there MUST exist AT LEAST one index that refers to this codon, - so use it to derive the missing bases [ALREADY complemented if on the negative strand]: - */ - for (int frame = CodonAnnotationsForAltAlleles.MIN_CODON_INDEX; frame < CodonAnnotationsForAltAlleles.NUM_CODON_INDICES; frame++) { - if (variantCodonArr[frame] == null) - variantCodonArr[frame] = (byte) refCodon.charAt(frame); - } - String variantCodon = new String(ByteArrayToPrimitive(variantCodonArr)).toUpperCase(); - - SingleCodonAnnotationsForAllele alleleAnnotation = new SingleCodonAnnotationsForAllele(variantCodon, refCodon, refAA, codonIndex); - annotationsForAlleles[altInd] = alleleAnnotation; - altInd++; - } - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - - sb.append(codonIndex).append(CODON_MAP_SYMBOL).append(CODON_ANNOTATION_START); - 
sb.append(REFSEQ_REF_CODON).append(ASSIGNMENT).append(refCodon).append(REF_CODON_INFO_DELIM); - sb.append(REFSEQ_REF_AA).append(ASSIGNMENT).append(refAA).append(REF_CODON_INFO_DELIM); - - int index = 0; - for (SingleCodonAnnotationsForAllele annotation : annotationsForAlleles) { - sb.append(annotation); - if (index < annotationsForAlleles.length - 1) - sb.append(ALLELE_ANNOTATION_DELIM); - index++; - } - sb.append(CODON_ANNOTATION_END); - - return sb.toString(); - } - } - - private static class SingleCodonAnnotationsForAllele { - private final static String ALLELE_START = "{"; - private final static String ALLELE_END = "}"; - private final static String CODON_INFO_DELIM = "|"; - private final static String ASSIGNMENT = ":"; - private final static String MNP_DEPENDENT_AA = "MNPdependentAA"; - - private CodonFunction codonFunc; - private String proteinCoordStr; - private boolean MNPdependentAA; - private String originalAA; - - public SingleCodonAnnotationsForAllele(String variantCodon, String refCodon, AminoAcid refAA, int codonIndex) { - this.codonFunc = new CodonFunction(variantCodon, refCodon, refAA); - this.proteinCoordStr = "p." 
+ refAA.getLetter() + codonIndex + codonFunc.variantAA.getLetter(); - - int refCodonLength = refCodon.length(); - if (codonFunc.variantCodon.length() != refCodonLength) - throw new ReviewedStingException("codonFunc.variantCodon.length() != refCodonLength, but ALREADY checked that they're both 3"); - - this.MNPdependentAA = true; - this.originalAA = "("; - for (int i = 0; i < refCodonLength; i++) { - // Take [0,i-1] and [i+1, end] from refCodon, and i from variantCodon: - String singleBaseChangeCodon = refCodon.substring(0, i) + variantCodon.substring(i, i+1) + refCodon.substring(i+1, refCodonLength); - CodonFunction singleBaseChangeCodonFunc = new CodonFunction(singleBaseChangeCodon, refCodon, refAA); - if (singleBaseChangeCodonFunc.variantAA.equals(codonFunc.variantAA)) { - this.MNPdependentAA = false; - this.originalAA = ""; - break; - } - - this.originalAA = this.originalAA + "" + singleBaseChangeCodonFunc.variantAA.getLetter(); - if (i < refCodonLength - 1) - this.originalAA = this.originalAA + ","; - } - - if (this.MNPdependentAA) - this.originalAA = this.originalAA + ")"; - } - - private static class CodonFunction { - private String variantCodon; - private AminoAcid variantAA; - private boolean changesAA; - private String functionalClass; - - public CodonFunction(String variantCodon, String refCodon, AminoAcid refAA) { - this.variantCodon = variantCodon; - this.variantAA = AminoAcidTable.getEukaryoticAA(this.variantCodon); - this.changesAA = !refAA.equals(variantAA); - - if (!this.variantCodon.equals(refCodon)) { - if (changesAA) { - if (variantAA.isStop()) { - functionalClass = "nonsense"; - } - else if (refAA.isStop()) { - functionalClass = "readthrough"; - } - else { - functionalClass = "missense"; - } - } - else { // the same aa: - functionalClass = "silent"; - } - } - else { // the same codon: - functionalClass = "no_change"; - } - } - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - - sb.append(ALLELE_START); - 
sb.append(REFSEQ_VARIANT_CODON).append(ASSIGNMENT).append(codonFunc.variantCodon).append(CODON_INFO_DELIM); - sb.append(REFSEQ_VARIANT_AA).append(ASSIGNMENT).append(codonFunc.variantAA.getCode()).append(CODON_INFO_DELIM); - sb.append(REFSEQ_CHANGES_AA).append(ASSIGNMENT).append(codonFunc.changesAA).append(CODON_INFO_DELIM); - sb.append(REFSEQ_FUNCTIONAL_CLASS).append(ASSIGNMENT).append(codonFunc.functionalClass).append(CODON_INFO_DELIM); - sb.append(REFSEQ_PROTEIN_COORD_DESCRIPTION).append(ASSIGNMENT).append(proteinCoordStr).append(CODON_INFO_DELIM); - sb.append(MNP_DEPENDENT_AA).append(ASSIGNMENT).append(MNPdependentAA).append(originalAA); - sb.append(ALLELE_END); - - return sb.toString(); - } - } -} - - -// External classes: - -class LocusToFeatures { - private Map locusToFeatures; - - public LocusToFeatures() { - this.locusToFeatures = new TreeMap(); - } - - public PositionalRefSeqFeatures getLocusFeatures(GenomeLoc loc) { - return locusToFeatures.get(loc); - } - - public void putLocusFeatures(GenomeLoc loc, AnnotatorInputTableFeature refSeqAnnotation, GenomeLoc locusUsingThis) { - PositionalRefSeqFeatures locFeatures = locusToFeatures.get(loc); - if (locFeatures == null) { - locFeatures = new PositionalRefSeqFeatures(locusUsingThis); - locusToFeatures.put(loc, locFeatures); - } - locFeatures.putFeature(refSeqAnnotation, locusUsingThis); - } - - public Set> entrySet() { - return locusToFeatures.entrySet(); - } - - public String toString() { // INTERNAL use only - StringBuilder sb = new StringBuilder(); - - for (Map.Entry locFeatures : entrySet()) { - GenomeLoc loc = locFeatures.getKey(); - PositionalRefSeqFeatures features = locFeatures.getValue(); - sb.append("Locus: ").append(loc).append("\n").append(features); - } - - return sb.toString(); - } -} - -class PositionalRefSeqFeatures { - private final static String[] REQUIRE_COLUMNS = - {AnnotateMNPsWalker.REFSEQ_NAME, AnnotateMNPsWalker.REFSEQ_POSITION_TYPE}; - - private Map nameToFeature; - private GenomeLoc 
furthestLocusUsingFeatures; - - public PositionalRefSeqFeatures(GenomeLoc locusUsingThis) { - this.nameToFeature = new HashMap(); - this.furthestLocusUsingFeatures = locusUsingThis; - } - - public void putFeature(AnnotatorInputTableFeature refSeqAnnotation, GenomeLoc locusUsingThis) { - for (String column : REQUIRE_COLUMNS) { - if (!refSeqAnnotation.containsColumnName(column)) - throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column); - } - - if (locusUsingThis.isPast(furthestLocusUsingFeatures)) - furthestLocusUsingFeatures = locusUsingThis; - - String posType = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_POSITION_TYPE); - if (!posType.equals(AnnotateMNPsWalker.REFSEQ_CDS)) // only interested in coding sequence annotations - return; - - PositionalRefSeqFeature newLocusFeature = new PositionalRefSeqFeature(refSeqAnnotation); - - String refSeqName = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_NAME); - PositionalRefSeqFeature locusFeature = nameToFeature.get(refSeqName); - if (locusFeature == null) { - locusFeature = newLocusFeature; - nameToFeature.put(refSeqName, locusFeature); - } - else if (!locusFeature.equals(newLocusFeature)) { - throw new UserException("Inconsistency between previous RefSeq entry and: " + refSeqAnnotation); - } - - locusFeature.updateFeature(refSeqAnnotation); - } - - public GenomeLoc getFurthestLocusUsingFeatures() { - return furthestLocusUsingFeatures; - } - - public Set> entrySet() { - return nameToFeature.entrySet(); - } - - public String toString() { // INTERNAL use only - StringBuilder sb = new StringBuilder(); - - for (Map.Entry nameFeatureEntry : entrySet()) { - String name = nameFeatureEntry.getKey(); - PositionalRefSeqFeature feature = nameFeatureEntry.getValue(); - sb.append(name).append(" -> [").append(feature).append("]\n"); - } - - return sb.toString(); - } -} - -class PositionalRefSeqFeature { - private final static String[] REQUIRE_COLUMNS = - 
{AnnotateMNPsWalker.REFSEQ_NAME2, AnnotateMNPsWalker.REFSEQ_STRAND, - AnnotateMNPsWalker.REFSEQ_CODON_COORD, AnnotateMNPsWalker.REFSEQ_CODING_FRAME, - AnnotateMNPsWalker.REFSEQ_REF_CODON, AnnotateMNPsWalker.REFSEQ_REF_AA}; - - protected String name2; - protected boolean positiveStrand; - protected int codonCoord; - protected int codingFrame; - protected String referenceCodon; - protected String referenceAA; - - private Map baseToAnnotations; - - public PositionalRefSeqFeature(AnnotatorInputTableFeature refSeqAnnotation) { - for (String column : REQUIRE_COLUMNS) { - if (!refSeqAnnotation.containsColumnName(column)) - throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column); - } - this.name2 = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_NAME2); - this.positiveStrand = (refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_STRAND).equals(AnnotateMNPsWalker.REFSEQ_POS_STRAND)); - this.codonCoord = Integer.parseInt(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CODON_COORD)); - this.codingFrame = Integer.parseInt(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CODING_FRAME)); - this.referenceCodon = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_REF_CODON); - this.referenceAA = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_REF_AA); - - this.baseToAnnotations = new HashMap(); - } - - public boolean equals(PositionalRefSeqFeature that) { - return this.name2.equals(that.name2) && this.positiveStrand == that.positiveStrand && this.codonCoord == that.codonCoord && this.codingFrame == that.codingFrame - && this.referenceCodon.equals(that.referenceCodon) && this.referenceAA.equals(that.referenceAA); - } - - public void updateFeature(AnnotatorInputTableFeature refSeqAnnotation) { - if (!refSeqAnnotation.containsColumnName(AnnotateMNPsWalker.REFSEQ_ALT_BASE)) - throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + AnnotateMNPsWalker.REFSEQ_ALT_BASE); - String 
base = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_ALT_BASE); - - baseToAnnotations.put(base, new BaseAnnotations(refSeqAnnotation)); - } - - public String toString() { // INTERNAL use only - StringBuilder sb = new StringBuilder(); - - sb.append("name2= ").append(name2); - sb.append(", positiveStrand= ").append(positiveStrand); - sb.append(", codonCoord= ").append(codonCoord); - sb.append(", codingFrame= ").append(codingFrame); - sb.append(", referenceCodon= ").append(referenceCodon); - sb.append(", referenceAA= ").append(referenceAA); - - sb.append(", baseAnnotations= {"); - for (Map.Entry baseToAnnotationsEntry : baseToAnnotations.entrySet()) { - String base = baseToAnnotationsEntry.getKey(); - BaseAnnotations annotations = baseToAnnotationsEntry.getValue(); - sb.append(" ").append(base).append(" -> {").append(annotations).append("}"); - } - sb.append(" }"); - - return sb.toString(); - } -} - -class BaseAnnotations { - private final static String[] REQUIRE_COLUMNS = - {AnnotateMNPsWalker.REFSEQ_VARIANT_CODON, AnnotateMNPsWalker.REFSEQ_VARIANT_AA, - AnnotateMNPsWalker.REFSEQ_CHANGES_AA, AnnotateMNPsWalker.REFSEQ_FUNCTIONAL_CLASS, - AnnotateMNPsWalker.REFSEQ_PROTEIN_COORD_DESCRIPTION}; - - protected String variantCodon; - protected String variantAA; - protected boolean changesAA; - protected String functionalClass; - protected String proteinCoordStr; - - public BaseAnnotations(AnnotatorInputTableFeature refSeqAnnotation) { - for (String column : REQUIRE_COLUMNS) { - if (!refSeqAnnotation.containsColumnName(column)) - throw new UserException("In RefSeq: " + refSeqAnnotation + " Missing column " + column); - } - this.variantCodon = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_VARIANT_CODON); - this.variantAA = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_VARIANT_AA); - this.changesAA = Boolean.parseBoolean(refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_CHANGES_AA)); - this.functionalClass = 
refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_FUNCTIONAL_CLASS); - this.proteinCoordStr = refSeqAnnotation.getColumnValue(AnnotateMNPsWalker.REFSEQ_PROTEIN_COORD_DESCRIPTION); - } - - - public String toString() { // INTERNAL use only - StringBuilder sb = new StringBuilder(); - - sb.append("variantCodon= ").append(variantCodon); - sb.append(", variantAA= ").append(variantAA); - sb.append(", changesAA= ").append(changesAA); - sb.append(", functionalClass= ").append(functionalClass); - sb.append(", proteinCoordStr= ").append(proteinCoordStr); - - return sb.toString(); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypes.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypes.java index 298d8d6c8..83216d214 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypes.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeAndMatchHaplotypes.java @@ -91,7 +91,7 @@ public class MergeAndMatchHaplotypes extends RodWalker { } VariantContext newvc = new VariantContext(SOURCE_NAME, pbt.getChr(), pbt.getStart(), pbt.getStart(), pbt.getAlleles(), genotypes, pbt.getNegLog10PError(), pbt.getFilters(), pbt.getAttributes()); - vcfWriter.add(newvc, ref.getBase()); + vcfWriter.add(newvc); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java index b0491a281..53cfaa3a9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/MergeSegregatingAlternateAllelesVCFWriter.java @@ -118,7 +118,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { innerWriter.close(); 
} - public void add(VariantContext vc, byte refBase) { + public void add(VariantContext vc) { if (useSingleSample != null) { // only want to output context for one sample Genotype sampGt = vc.getGenotype(useSingleSample); if (sampGt != null) // TODO: subContextFromGenotypes() does not handle any INFO fields [AB, HaplotypeScore, MQ, etc.]. Note that even SelectVariants.subsetRecord() only handles AC,AN,AF, and DP! @@ -138,11 +138,11 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { if (curVcIsNotFiltered) { // still need to wait before can release vc logger.debug("Waiting for new variant " + VariantContextUtils.getLocation(genomeLocParser, vc)); - vcfrWaitingToMerge = new VCFRecord(vc, refBase, false); + vcfrWaitingToMerge = new VCFRecord(vc, false); } else if (!emitOnlyMergedRecords) { // filtered records are never merged logger.debug("DIRECTLY output " + VariantContextUtils.getLocation(genomeLocParser, vc)); - innerWriter.add(vc, refBase); + innerWriter.add(vc); } } else { // waiting to merge vcfrWaitingToMerge @@ -151,7 +151,7 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { if (!curVcIsNotFiltered) { if (!emitOnlyMergedRecords) { // filtered records are never merged logger.debug("Caching unprocessed output " + VariantContextUtils.getLocation(genomeLocParser, vc)); - filteredVcfrList.add(new VCFRecord(vc, refBase, false)); + filteredVcfrList.add(new VCFRecord(vc, false)); } } else { // waiting to merge vcfrWaitingToMerge, and curVcIsNotFiltered. 
So, attempt to merge them: @@ -188,14 +188,14 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { addedAttribs.putAll(mergedVc.getAttributes()); mergedVc = VariantContext.modifyAttributes(mergedVc, addedAttribs); - vcfrWaitingToMerge = new VCFRecord(mergedVc, vcfrWaitingToMerge.refBase, true); + vcfrWaitingToMerge = new VCFRecord(mergedVc, true); numMergedRecords++; } } if (!mergedRecords) { stopWaitingToMerge(); - vcfrWaitingToMerge = new VCFRecord(vc, refBase, false); + vcfrWaitingToMerge = new VCFRecord(vc, false); } logger.debug("Merged? = " + mergedRecords); } @@ -210,11 +210,11 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { } if (!emitOnlyMergedRecords || vcfrWaitingToMerge.resultedFromMerge) - innerWriter.add(vcfrWaitingToMerge.vc, vcfrWaitingToMerge.refBase); + innerWriter.add(vcfrWaitingToMerge.vc); vcfrWaitingToMerge = null; for (VCFRecord vcfr : filteredVcfrList) - innerWriter.add(vcfr.vc, vcfr.refBase); + innerWriter.add(vcfr.vc); filteredVcfrList.clear(); } @@ -257,12 +257,10 @@ public class MergeSegregatingAlternateAllelesVCFWriter implements VCFWriter { private static class VCFRecord { public VariantContext vc; - public byte refBase; public boolean resultedFromMerge; - public VCFRecord(VariantContext vc, byte refBase, boolean resultedFromMerge) { + public VCFRecord(VariantContext vc, boolean resultedFromMerge) { this.vc = vc; - this.refBase = refBase; this.resultedFromMerge = resultedFromMerge; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index b24437c4a..992e4d9d3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -9,12 +9,14 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import 
org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import java.io.File; +import java.io.FileNotFoundException; import java.util.*; /** @@ -29,37 +31,75 @@ import java.util.*; * begin. */ public class PhaseByTransmission extends RodWalker { - @Argument(shortName="f", fullName="familyPattern", required=true, doc="Pattern for the family structure (usage: mom+dad=child)") - public String familyStr = null; - - @Argument(shortName="nofilters", fullName="disableFilters", required=false, doc="Disable filters for sites where the phase can't be determined, where the parental origin of the alleles is ambiguous (i.e. everyone is heterozygous), or Mendelian violations") - public Boolean noFilters = false; + @Argument(shortName="f", fullName="familySpec", required=true, doc="Patterns for the family structure (usage: mom+dad=child). 
Specify several trios by supplying this argument many times and/or a file containing many patterns.") + public ArrayList familySpecs = null; @Output protected VCFWriter vcfWriter = null; - private String SAMPLE_NAME_MOM; - private String SAMPLE_NAME_DAD; - private String SAMPLE_NAME_CHILD; - private final String ROD_NAME = "variant"; - private final String AMBIGUOUS_ALLELE_ORIGIN_FILTER_NAME = "AmbiguousAlleleOrigin"; - private final String INSUFFICIENT_DATA_FILTER_NAME = "InsufficientInformation"; - private final String MENDELIAN_VIOLATION_FILTER_NAME = "MendelianViolation"; private final String TRANSMISSION_PROBABILITY_TAG_NAME = "TP"; private final String SOURCE_NAME = "PhaseByTransmission"; private final Double MENDELIAN_VIOLATION_PRIOR = 1e-8; + private class Trio { + private String mother; + private String father; + private String child; + + public Trio(String mother, String father, String child) { + this.mother = mother; + this.father = father; + this.child = child; + } + + public Trio(String familySpec) { + String[] pieces = familySpec.split("[\\+\\=]"); + + this.mother = pieces[0]; + this.father = pieces[1]; + this.child = pieces[2]; + } + + public String getMother() { return mother; } + public String getFather() { return father; } + public String getChild() { return child; } + } + + private ArrayList trios = new ArrayList(); + + public ArrayList getFamilySpecsFromCommandLineInput(ArrayList familySpecs) { + if (familySpecs != null) { + // Let's first go through the list and see if we were given any files. We'll add every entry in the file to our + // spec list set, and treat the entries as if they had been specified on the command line. 
+ ArrayList specs = new ArrayList(); + for (String familySpec : familySpecs) { + File specFile = new File(familySpec); + + try { + XReadLines reader = new XReadLines(specFile); + + List lines = reader.readLines(); + for (String line : lines) { + specs.add(new Trio(line)); + } + } catch (FileNotFoundException e) { + specs.add(new Trio(familySpec)); // not a file, so must be a family spec + } + } + + return specs; + } + + return new ArrayList(); + } + /** * Parse the familial relationship specification, and initialize VCF writer */ public void initialize() { - String[] pieces = familyStr.split("[\\+\\=]"); - - SAMPLE_NAME_MOM = pieces[0]; - SAMPLE_NAME_DAD = pieces[1]; - SAMPLE_NAME_CHILD = pieces[2]; + trios = getFamilySpecsFromCommandLineInput(familySpecs); ArrayList rodNames = new ArrayList(); rodNames.add(ROD_NAME); @@ -67,34 +107,11 @@ public class PhaseByTransmission extends RodWalker { Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); - if (vcfSamples.size() != 3) { - throw new UserException("File to phase by transmission contains more than three samples. This walker only" + - "accepts VCFs with three samples, so that the meaning of the applied filters is" + - "unambiguous."); - } - - if (!vcfSamples.contains(SAMPLE_NAME_MOM) || !vcfSamples.contains(SAMPLE_NAME_DAD) || !vcfSamples.contains(SAMPLE_NAME_CHILD)) { - throw new UserException("One or more of the samples specified in the familyPattern argument is not present" + - "in this file. 
Please supply a VCF file that contains only three samples: the" + - "mother, the father, and the child"); - } - - Set samples = new TreeSet(); - samples.add(SAMPLE_NAME_MOM); - samples.add(SAMPLE_NAME_DAD); - samples.add(SAMPLE_NAME_CHILD); - Set headerLines = new HashSet(); headerLines.addAll(VCFUtils.getHeaderFields(this.getToolkit())); - - if (!noFilters) { - headerLines.add(new VCFFilterHeaderLine(AMBIGUOUS_ALLELE_ORIGIN_FILTER_NAME, "The parental origin of each of the child's allele cannot be determined (ie everyone is heterozygous)")); - headerLines.add(new VCFFilterHeaderLine(INSUFFICIENT_DATA_FILTER_NAME, "The phase of the child's genotype cannot be determined (ie someone is a no-call)")); - headerLines.add(new VCFFilterHeaderLine(MENDELIAN_VIOLATION_FILTER_NAME, "No combination of the parents' alleles can yield the child's genotype (ie a possible Mendelian violation)")); - } - - headerLines.add(new VCFInfoHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct")); - vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); + headerLines.add(new VCFFormatHeaderLine(TRANSMISSION_PROBABILITY_TAG_NAME, 1, VCFHeaderLineType.Float, "Probability that the phase is correct given that the genotypes are correct")); + headerLines.add(new VCFHeaderLine("source", SOURCE_NAME)); + vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); } private double computeTransmissionLikelihoodOfGenotypeConfiguration(Genotype mom, Genotype dad, Genotype child) { @@ -211,68 +228,54 @@ public class PhaseByTransmission extends RodWalker { return finalGenotypes; } - private VariantContext phaseTrioGenotypes(VariantContext vc) { - Genotype mom = vc.getGenotype(SAMPLE_NAME_MOM); - Genotype dad = vc.getGenotype(SAMPLE_NAME_DAD); - Genotype child = vc.getGenotype(SAMPLE_NAME_CHILD); - - Set filters = new HashSet(); - filters.addAll(vc.getFilters()); - - Map attributes = new HashMap(); - 
attributes.putAll(vc.getAttributes()); - attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, 0.0); - + private ArrayList phaseTrioGenotypes(Allele ref, Allele alt, Genotype mother, Genotype father, Genotype child) { ArrayList finalGenotypes = new ArrayList(); - finalGenotypes.add(mom); - finalGenotypes.add(dad); + finalGenotypes.add(mother); + finalGenotypes.add(father); finalGenotypes.add(child); - if (!mom.isCalled() || !dad.isCalled() || !child.isCalled()) { - filters.add(INSUFFICIENT_DATA_FILTER_NAME); - } else { - ArrayList possibleMomGenotypes = createAllThreeGenotypes(vc.getReference(), vc.getAlternateAllele(0), mom); - ArrayList possibleDadGenotypes = createAllThreeGenotypes(vc.getReference(), vc.getAlternateAllele(0), dad); - ArrayList possibleChildGenotypes = createAllThreeGenotypes(vc.getReference(), vc.getAlternateAllele(0), child); + if (mother.isCalled() && father.isCalled() && child.isCalled()) { + ArrayList possibleMotherGenotypes = createAllThreeGenotypes(ref, alt, mother); + ArrayList possibleFatherGenotypes = createAllThreeGenotypes(ref, alt, father); + ArrayList possibleChildGenotypes = createAllThreeGenotypes(ref, alt, child); double bestConfigurationLikelihood = 0.0; double bestPrior = 0.0; - Genotype bestMomGenotype = mom; - Genotype bestDadGenotype = dad; + Genotype bestMotherGenotype = mother; + Genotype bestFatherGenotype = father; Genotype bestChildGenotype = child; double norm = 0.0; - for (Genotype momGenotype : possibleMomGenotypes) { - for (Genotype dadGenotype : possibleDadGenotypes) { + for (Genotype motherGenotype : possibleMotherGenotypes) { + for (Genotype fatherGenotype : possibleFatherGenotypes) { for (Genotype childGenotype : possibleChildGenotypes) { - double prior = isMendelianViolation(vc.getReference(), vc.getAlternateAllele(0), momGenotype, dadGenotype, childGenotype) ? 
MENDELIAN_VIOLATION_PRIOR : 1.0 - 12*MENDELIAN_VIOLATION_PRIOR; - double configurationLikelihood = computeTransmissionLikelihoodOfGenotypeConfiguration(momGenotype, dadGenotype, childGenotype); + double prior = isMendelianViolation(ref, alt, motherGenotype, fatherGenotype, childGenotype) ? MENDELIAN_VIOLATION_PRIOR : 1.0 - 12*MENDELIAN_VIOLATION_PRIOR; + double configurationLikelihood = computeTransmissionLikelihoodOfGenotypeConfiguration(motherGenotype, fatherGenotype, childGenotype); norm += prior*configurationLikelihood; if (prior*configurationLikelihood > bestPrior*bestConfigurationLikelihood) { bestConfigurationLikelihood = configurationLikelihood; bestPrior = prior; - bestMomGenotype = momGenotype; - bestDadGenotype = dadGenotype; + bestMotherGenotype = motherGenotype; + bestFatherGenotype = fatherGenotype; bestChildGenotype = childGenotype; } } } } - if (isMendelianViolation(vc.getReference(), vc.getAlternateAllele(0), bestMomGenotype, bestDadGenotype, bestChildGenotype)) { - filters.add(MENDELIAN_VIOLATION_FILTER_NAME); - } else if (bestMomGenotype.isHet() && bestDadGenotype.isHet() && bestChildGenotype.isHet()) { - filters.add(AMBIGUOUS_ALLELE_ORIGIN_FILTER_NAME); - } else { - finalGenotypes = getPhasedGenotypes(bestMomGenotype, bestDadGenotype, bestChildGenotype); - + if (!(bestMotherGenotype.isHet() && bestFatherGenotype.isHet() && bestChildGenotype.isHet())) { + Map attributes = new HashMap(); + attributes.putAll(bestChildGenotype.getAttributes()); attributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, bestPrior*bestConfigurationLikelihood / norm); + bestChildGenotype = Genotype.modifyAttributes(bestChildGenotype, attributes); + + finalGenotypes = getPhasedGenotypes(bestMotherGenotype, bestFatherGenotype, bestChildGenotype); } } - return new VariantContext(SOURCE_NAME, vc.getChr(), vc.getStart(), vc.getStart(), vc.getAlleles(), finalGenotypes, vc.getNegLog10PError(), noFilters ? 
vc.getFilters() : filters, attributes); + return finalGenotypes; } /** @@ -289,7 +292,27 @@ public class PhaseByTransmission extends RodWalker { Collection vcs = tracker.getVariantContexts(ref, ROD_NAME, null, context.getLocation(), true, true); for (VariantContext vc : vcs) { - vcfWriter.add(phaseTrioGenotypes(vc), ref.getBase()); + Map genotypeMap = vc.getGenotypes(); + + for (Trio trio : trios) { + Genotype mother = vc.getGenotype(trio.getMother()); + Genotype father = vc.getGenotype(trio.getFather()); + Genotype child = vc.getGenotype(trio.getChild()); + + ArrayList trioGenotypes = phaseTrioGenotypes(vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), mother, father, child); + + Genotype phasedMother = trioGenotypes.get(0); + Genotype phasedFather = trioGenotypes.get(1); + Genotype phasedChild = trioGenotypes.get(2); + + genotypeMap.put(phasedMother.getSampleName(), phasedMother); + genotypeMap.put(phasedFather.getSampleName(), phasedFather); + genotypeMap.put(phasedChild.getSampleName(), phasedChild); + } + + VariantContext newvc = VariantContext.modifyGenotypes(vc, genotypeMap); + + vcfWriter.add(newvc); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java index 2851ace0d..c10eaa2da 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/WriteVCF.java @@ -25,20 +25,10 @@ package org.broadinstitute.sting.gatk.walkers.phasing; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; -import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; public class WriteVCF { public static void writeVCF(VariantContext vc, VCFWriter writer, Logger logger) { - byte refBase; - if (!vc.isIndel()) { - Allele refAllele = vc.getReference(); - refBase = 
SNPallelePair.getSingleBase(refAllele); - } - else { - refBase = vc.getReferenceBaseForIndel(); - } - - writer.add(vc, refBase); + writer.add(vc); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java index df89efe6d..26fa9a258 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java @@ -40,7 +40,6 @@ import java.util.List; * of paired reads. * * @author mhanna - * @version 0.1 */ public class CountPairsWalker extends ReadPairWalker { @Output diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java index 8c6539f8d..775cde1f4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java @@ -58,6 +58,8 @@ import java.util.List; import java.util.Map; /** + * First pass of the recalibration. Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide). + * * This walker is designed to work as the first pass in a two-pass processing step. * It does a by-locus traversal operating only at sites that are not in dbSNP. * We assume that all reference mismatches we see are therefore errors and indicative of poor base quality. @@ -72,7 +74,6 @@ import java.util.Map; * * @author rpoplin * @since Nov 3, 2009 - * @help.summary First pass of the recalibration. Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide). 
*/ @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 0277fda0d..fec7ee4e6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -54,8 +54,10 @@ import java.util.ResourceBundle; import java.util.regex.Pattern; /** + * Second pass of the recalibration. Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate. + * * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. - + * * For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc) * Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read. * This walker then outputs a new bam file with these updated (recalibrated) reads. @@ -65,7 +67,6 @@ import java.util.regex.Pattern; * * @author rpoplin * @since Nov 3, 2009 - * @help.summary Second pass of the recalibration. Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate. 
*/ @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/CreateSequenomMask.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/CreateSequenomMask.java deleted file mode 100755 index b3b63bb96..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/CreateSequenomMask.java +++ /dev/null @@ -1,50 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.sequenom; - -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.io.PrintStream; - -/** - * Create a mask for use with the PickSequenomProbes walker. 
- */ -public class CreateSequenomMask extends RodWalker { - @Output - PrintStream out; - - public void initialize() {} - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return 0; - - int result = 0; - for ( VariantContext vc : tracker.getAllVariantContexts(ref) ) { - if ( vc.isSNP() ) { - GenomeLoc loc = context.getLocation(); - out.println(loc.getContig() + "\t" + (loc.getStart()-1) + "\t" + loc.getStop()); - result = 1; - break; - } - } - - return result; - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - public void onTraversalDone(Integer sum) { - logger.info("Found " + sum + " masking sites."); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java deleted file mode 100755 index b877ff70b..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java +++ /dev/null @@ -1,332 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.sequenom; - -import org.broad.tribble.bed.BEDCodec; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; -import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.io.File; -import java.io.PrintStream; -import java.util.*; - - -/** - * Generates Sequenom probe information given a single variant track. Emitted is the variant - * along with the 200 reference bases on each side of the variant. 
- */ -@WalkerName("PickSequenomProbes") -@Requires(value={DataSource.REFERENCE}) -@Reference(window=@Window(start=-200,stop=200)) -public class PickSequenomProbes extends RodWalker { - @Output - PrintStream out; - - @Argument(required=false, shortName="snp_mask", doc="positions to be masked with N's") - protected String SNP_MASK = null; - @Argument(required=false, shortName="project_id",doc="If specified, all probenames will be prepended with 'project_id|'") - String project_id = null; - @Argument(required = false, shortName="omitWindow", doc = "If specified, the window appender will be omitted from the design files (e.g. \"_chr:start-stop\")") - boolean omitWindow = false; - @Argument(required = false, fullName="usePlinkRODNamingConvention", shortName="nameConvention",doc="Use the naming convention defined in PLINKROD") - boolean useNamingConvention = false; - @Argument(required = false, fullName="noMaskWindow",shortName="nmw",doc="Do not mask bases within X bases of an event when designing probes") - int noMaskWindow = 0; - @Argument(required = false, shortName="counter", doc = "If specified, unique count id (ordinal number) is added to the end of each assay name") - boolean addCounter = false; - - private byte [] maskFlags = new byte[401]; - - private LocationAwareSeekableRODIterator snpMaskIterator=null; - - private GenomeLoc positionOfLastVariant = null; - - private int cnt = 0; - private int discarded = 0; - - VariantCollection VCs ; // will keep a set of distinct variants at a given site - private List processedVariantsInScope = new LinkedList(); - - public void initialize() { - if ( SNP_MASK != null ) { - logger.info("Loading SNP mask... 
"); - ReferenceOrderedData snp_mask; - //if ( SNP_MASK.contains(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)) { - RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser(),getToolkit().getArguments().unsafe); - RMDTrack track = builder.createInstanceOfTrack(BEDCodec.class, new File(SNP_MASK)); - snpMaskIterator = new SeekableRODIterator(track.getHeader(), - track.getSequenceDictionary(), - getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(), - getToolkit().getGenomeLocParser(), - track.getIterator()); - //} else { - // // TODO: fix me when Plink is back - // throw new IllegalArgumentException("We currently do not support other snp_mask tracks (like Plink)"); - //} - - } - VCs = new VariantCollection(); - } - - - public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return ""; - - logger.debug("Probing " + ref.getLocus() + " " + ref.getWindow()); - - VCs.clear(); - VCs.addAll( tracker.getAllVariantContexts(ref), ref.getLocus() ); - - discarded += VCs.discarded(); - - if ( VCs.size() == 0 ) { - logger.debug(" Context empty"); - return ""; - } - - if ( VCs.size() > 1 ) { - logger.debug(" "+VCs.size()+ " variants at the locus"); - } - -// System.out.print("At locus "+ref.getLocus()+": "); -// for ( VariantContext vc : VCs ) { -// System.out.println(vc.toString()); -// } - - // little optimization: since we may have few events at the current site on the reference, - // we are going to make sure we compute the mask and ref bases only once for each location and only if we need to - boolean haveMaskForWindow = false; - boolean haveBasesForWindow = false; - String leading_bases = null; - String trailing_bases = null; - - StringBuilder assaysForLocus = new StringBuilder(""); // all assays for current locus will be collected here (will be multi-line if multiple events are assayed) - - // get 
all variant contexts!!!! - for ( VariantContext vc : VCs ) { - - // we can only deal with biallelic sites for now - if ( !vc.isBiallelic() ) { - logger.debug(" Not biallelic; skipped"); - continue; - } - - // we don't want to see the same multi-base event (deletion, DNP etc) multiple times. - // All the vcs we are currently seeing are clearly on the same contig as the current reference - // poisiton (or we would not see them at all!). All we need to check is if the vc starts at the - // current reference position (i.e. it is the first time we see it) or not (i.e. we saw it already). - if ( ref.getLocus().getStart() != vc.getStart() ) - continue; - - if ( ! haveMaskForWindow ) { - String contig = context.getLocation().getContig(); - int offset = context.getLocation().getStart(); - int true_offset = offset - 200; - - // we have variant; let's load all the snps falling into the current window and prepare the mask array. - // we need to do it only once per window, regardless of how many vcs we may have at this location! - if ( snpMaskIterator != null ) { - // clear the mask - for ( int i = 0 ; i < 401; i++ ) - maskFlags[i] = 0; - - RODRecordList snpList = snpMaskIterator.seekForward(getToolkit().getGenomeLocParser().createGenomeLoc(contig,offset-200,offset+200)); - if ( snpList != null && snpList.size() != 0 ) { - Iterator snpsInWindow = snpList.iterator(); - int i = 0; - while ( snpsInWindow.hasNext() ) { - GenomeLoc snp = snpsInWindow.next().getLocation(); - // we don't really want to mask out multi-base indels - if ( snp.size() > 1 ) - continue; - logger.debug(" SNP at "+snp.getStart()); - int offsetInWindow = (int)(snp.getStart() - true_offset); - maskFlags[offsetInWindow] = 1; - } - } - } - haveMaskForWindow = true; // if we use masking, we will probably need to recompute the window... - } - - if ( ! 
haveBasesForWindow ) { - byte[] context_bases = ref.getBases(); - for (int i = 0; i < 401; i++) { - if ( maskFlags[i] == 1 && ( i < 200 - noMaskWindow || i > 200 + getNoMaskWindowRightEnd(vc,noMaskWindow) ) ) { - context_bases[i] = 'N'; - } - } - leading_bases = new String(Arrays.copyOfRange(context_bases, 0, 200)); - trailing_bases = new String(Arrays.copyOfRange(context_bases, 201, 401)); - // masked bases are not gonna change for the current window, unless we use windowed masking; - // in the latter case the bases (N's) will depend on the event we are currently looking at, - // so we better recompute.. - if ( noMaskWindow == 0 ) haveBasesForWindow = true; - } - - - // below, build single assay line for the current VC: - - String assay_sequence; - if ( vc.isSNP() ) - assay_sequence = leading_bases + "[" + (char)ref.getBase() + "/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases; - else if ( vc.isMNP() ) - assay_sequence = leading_bases + "[" + new String(vc.getReference().getBases()) + "/" + new String(vc.getAlternateAllele(0).getBases())+"]"+trailing_bases.substring(vc.getReference().length()-1); - else if ( vc.isInsertion() ) - assay_sequence = leading_bases + (char)ref.getBase() + "[-/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases; - else if ( vc.isDeletion() ) - assay_sequence = leading_bases + (char)ref.getBase() + "[" + new String(vc.getReference().getBases()) + "/-]" + trailing_bases.substring(vc.getReference().length()); - else - continue; - - StringBuilder assay_id = new StringBuilder(); - if ( project_id != null ) { - assay_id.append(project_id); - assay_id.append('|'); - } - if ( useNamingConvention ) { - assay_id.append('c'); - assay_id.append(context.getLocation().toString().replace(":","_p")); - } else { - assay_id.append(context.getLocation().toString().replace(':','_')); - } - if ( vc.isInsertion() ) assay_id.append("_gI"); - else if ( vc.isDeletion()) assay_id.append("_gD"); - - if ( ! 
omitWindow ) { - assay_id.append("_"); - assay_id.append(ref.getWindow().toString().replace(':', '_')); - } - ++cnt; - if ( addCounter ) assay_id.append("_"+cnt); - - assaysForLocus.append(assay_id); - assaysForLocus.append('\t'); - assaysForLocus.append(assay_sequence); - assaysForLocus.append('\n'); - } - return assaysForLocus.toString(); - } - - public String reduceInit() { - return ""; - } - - public String reduce(String data, String sum) { - out.print(data); - return ""; - } - - private int getNoMaskWindowRightEnd(VariantContext vc, int window) { - if ( window == 0 ) { - return 0; - } - - if ( vc.isInsertion() ) { - return window-1; - } - - int max = 0; - for (Allele a : vc.getAlleles() ) { - if ( vc.isInsertion() ) { - logger.debug("Getting length of allele "+a.toString()+" it is "+a.getBases().length+" (ref allele is "+vc.getReference().toString()+")"); - } - if ( a.getBases().length > max ) { - max = a.getBases().length; - } - } - return max+window-1; - } - - public void onTraversalDone(String sum) { - logger.info(cnt+" assay seqences generated"); - logger.info(discarded+" events were found to be duplicates and discarded (no redundant assays generated)"); - } - - static class EventComparator implements Comparator { - - public int compare(VariantContext o1, VariantContext o2) { - // if variants start at different positions, they are different. All we actually - // care about is detecting the variants that are strictly the same; the actual ordering of distinct variants - // (which one we deem less and which one greater) is utterly unimportant. We just need to be consistent. 
- if ( o1.getStart() < o2.getStart() ) return -1; - if ( o1.getStart() > o2.getStart() ) return 1; - - if ( o1.getType() != o2.getType() ) return o1.getType().compareTo(o2.getType()); - - int refComp = o1.getReference().compareTo(o2.getReference()); - if ( refComp != 0 ) return refComp; - - return o1.getAlternateAllele(0).compareTo(o2.getAlternateAllele(0)); - - } - } - - static class VariantCollection implements Iterable { - TreeSet variants = new TreeSet(new EventComparator()); - int discarded = 0; - - public void add(VariantContext vc, GenomeLoc current) { - if ( vc.getStart() != current.getStart() ) return; // we add only variants that start at current locus - // note that we do not check chr here, since the way this class is used, the mathod is always called with - // VCs coming from the same metadata tracker, so they simply can not be on different chrs! - if ( !vc.isBiallelic() ) { - logger.info(" Non-biallelic variant encountered; skipped"); - return; - } - if ( variants.add(vc) == false ) discarded++; - } - - public void addAll(Collection c, GenomeLoc current) { - for ( VariantContext vc : c ) add(vc,current); - } - - public void clear() { - variants.clear(); - discarded = 0; - } - - public int discarded() { return discarded; } - - public int size() { return variants.size(); } - - public Iterator iterator() { return variants.iterator(); } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java new file mode 100755 index 000000000..cb03d4c61 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -0,0 +1,414 @@ +package org.broadinstitute.sting.gatk.walkers.validation; + +import net.sf.picard.reference.ReferenceSequenceFileFactory; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMSequenceDictionary; +import 
org.broadinstitute.sting.alignment.Alignment; +import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; +import org.broadinstitute.sting.alignment.bwa.BWTFiles; +import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.RMD; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 6/13/11 + * Time: 2:12 PM + * To change this template use File | Settings | File Templates. 
+ */ +@Requires(value={DataSource.REFERENCE}, referenceMetaData={@RMD(name="ProbeIntervals",type=TableFeature.class), +@RMD(name="ValidateAlleles",type=VariantContext.class),@RMD(name="MaskAlleles",type=VariantContext.class)}) +public class ValidationAmplicons extends RodWalker { + + @Argument(doc="Lower case SNPs rather than replacing with 'N'",fullName="lowerCaseSNPs",required=false) + boolean lowerCaseSNPs = false; + + @Argument(doc="Size of the virtual primer to use for lower-casing regions with low specificity",fullName="virtualPrimerSize",required=false) + int virtualPrimerSize = 20; + + @Argument(doc="Monomorphic sites in the mask file will be treated as filtered",fullName="filterMonomorphic",required=false) + boolean filterMonomorphic = false; + + @Argument(doc="Do not use BWA, lower-case repeats only",fullName="doNotUseBWA",required=false) + boolean doNotUseBWA = false; + + GenomeLoc prevInterval; + GenomeLoc allelePos; + String probeName; + StringBuilder sequence; + StringBuilder rawSequence; + boolean sequenceInvalid; + List invReason; + int indelCounter; + + @Argument(fullName="target_reference",shortName="target_ref",doc="The reference to which reads in the source file should be aligned. Alongside this reference should sit index files " + + "generated by bwa index -d bwtsw. If unspecified, will default " + + "to the reference specified via the -R argument.",required=false) + private File targetReferenceFile = null; + + @Output + PrintStream out; + + BWACAligner aligner = null; + + private SAMFileHeader header = null; + + public void initialize() { + if ( ! 
doNotUseBWA ) { + if(targetReferenceFile == null) + targetReferenceFile = getToolkit().getArguments().referenceFile; + BWTFiles bwtFiles = new BWTFiles(targetReferenceFile.getAbsolutePath()); + BWAConfiguration configuration = new BWAConfiguration(); + aligner = new BWACAligner(bwtFiles,configuration); + header = new SAMFileHeader(); + SAMSequenceDictionary referenceDictionary = + ReferenceSequenceFileFactory.getReferenceSequenceFile(targetReferenceFile).getSequenceDictionary(); + header.setSequenceDictionary(referenceDictionary); + header.setSortOrder(SAMFileHeader.SortOrder.unsorted); + } + } + + public Integer reduceInit() { + prevInterval = null; + sequence = null; + rawSequence = null; + sequenceInvalid = false; + probeName = null; + invReason = null; + indelCounter = 0; + return 0; + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null || ! tracker.hasROD("ProbeIntervals")) { return null; } + + GenomeLoc interval = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getLocation(); + //logger.debug(interval); + if ( prevInterval == null || ! 
interval.equals(prevInterval) ) { + // we're in a new interval, we should: + // 1) print out previous data + // 2) reset internal data + // 3) instantiate traversal of this interval + + // step 1: + if ( prevInterval != null ) { + // there was a previous interval + validateSequence(); // ensure the sequence in the region is valid + // next line removed in favor of the one after + if ( doNotUseBWA ) { + lowerRepeats(); // change repeats in sequence to lower case + } else { + lowerNonUniqueSegments(); + } + print(); // print out the fasta sequence + } + + // step 2: + prevInterval = interval; + allelePos = null; + sequence = new StringBuilder(); + rawSequence = new StringBuilder(); + sequenceInvalid = false; + invReason = new LinkedList(); + logger.debug(Utils.join("\t",((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getAllValues())); + probeName = ((TableFeature) tracker.getReferenceMetaData("ProbeIntervals",true).get(0)).getValue(1); + indelCounter = 0; + } + + // step 3 (or 1 if not new): + // build up the sequence + + VariantContext mask = tracker.getVariantContext(ref,"MaskAlleles",ref.getLocus()); + VariantContext validate = tracker.getVariantContext(ref,"ValidateAlleles",ref.getLocus()); + + if ( mask == null && validate == null ) { + if ( indelCounter > 0 ) { + sequence.append('N'); + indelCounter--; + } else { + sequence.append(Character.toUpperCase((char) ref.getBase())); + } + rawSequence.append(Character.toUpperCase((char) ref.getBase())); + } else if ( validate != null ) { + // doesn't matter if there's a mask here too -- this is what we want to validate + if ( validate.isFiltered() ) { + logger.warn("You are attempting to validate a filtered site. Why are you attempting to validate a filtered site? 
You should not be attempting to validate a filtered site."); + sequenceInvalid = true; + invReason.add("SITE_IS_FILTERED"); + } + if ( validate.isIndel() ) { + sequence.append(Character.toUpperCase((char)ref.getBase())); + rawSequence.append(Character.toUpperCase((char)ref.getBase())); + } + sequence.append('['); + sequence.append(validate.getAlternateAllele(0).toString()); + sequence.append('/'); + sequence.append(validate.getReference().toString()); + sequence.append(']'); + // do this to the raw sequence to -- the indeces will line up that way + rawSequence.append('['); + rawSequence.append(validate.getAlternateAllele(0).getBaseString()); + rawSequence.append('/'); + rawSequence.append(validate.getReference().getBaseString()); + rawSequence.append(']'); + allelePos = ref.getLocus(); + if ( indelCounter > 0 ) { + logger.warn("An indel event overlaps the event to be validated. This completely invalidates the probe."); + sequenceInvalid = true; + invReason.add("INDEL_OVERLAPS_VALIDATION_SITE"); + if ( validate.isSNP() ) { + indelCounter--; + } else { + indelCounter -= validate.getEnd()-validate.getStart(); + } + } + } else /* (mask != null && validate == null ) */ { + if ( ! mask.isSNP() && ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )) { + logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed."); + logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles()))); + sequenceInvalid = true; + invReason.add(mask.isInsertion() ? "INSERTION" : "DELETION"); + // note: indelCounter could be > 0 (could have small deletion within larger one). This always selects + // the larger event. + int indelCounterNew = mask.isInsertion() ? 
2 : mask.getEnd()-mask.getStart(); + if ( indelCounterNew > indelCounter ) { + indelCounter = indelCounterNew; + } + //sequence.append((char) ref.getBase()); + //sequence.append(mask.isInsertion() ? 'I' : 'D'); + sequence.append("N"); + indelCounter--; + rawSequence.append(Character.toUpperCase((char) ref.getBase())); + } else if ( indelCounter > 0 ) { + // previous section resets the indel counter. Doesn't matter if there's a SNP underlying this, we just want to append an 'N' and move on. + sequence.append('N'); + indelCounter--; + rawSequence.append(Character.toUpperCase((char)ref.getBase())); + } else if ( ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )){ + logger.debug("SNP in mask found at " + ref.getLocus().toString()); + + if ( lowerCaseSNPs ) { + sequence.append(Character.toLowerCase((char) ref.getBase())); + } else { + sequence.append((char) BaseUtils.N); + } + + rawSequence.append(Character.toUpperCase((char) ref.getBase())); + } else { /* the mask record is filtered or (with filterMonomorphic) monomorphic, whatever its type: emit the plain reference base. The former 'else if ( mask.isSNP() )' let an ignored non-SNP mask match no branch at all, so nothing was appended and the rest of the amplicon shifted by one base. */ + logger.debug("Mask variant found at "+ref.getLocus().toString()+" but was either filtered or monomorphic"); + sequence.append((Character.toUpperCase((char) ref.getBase()))); + rawSequence.append(Character.toUpperCase((char) ref.getBase())); + } + } + + return 1; + } + + public Integer reduce(Integer i, Integer j) { + return 0; + } + + public void onTraversalDone(Integer fin ) { + validateSequence(); + if ( doNotUseBWA ) { + lowerRepeats(); + } else { + lowerNonUniqueSegments(); + aligner.close(); + } + print(); + } + + public void validateSequence() { + // code for ensuring primer sequence is valid goes here + + // validate that there are no masked sites near to the variant site + String seq = sequence.toString(); + int start = seq.indexOf('[') - 4; + int end = seq.indexOf(']') + 5; + + if ( start < 50 ) { + logger.warn("There is not enough sequence before the start position of the probed allele for adequate probe design.
This site will not be designed."); + sequenceInvalid = true; + invReason.add("START_TOO_CLOSE"); + } else if ( end > seq.length() - 50 ) { + logger.warn("There is not enough sequence after the end position of the probed allele fore adequate probe design. This site will not be desinged. "); + sequenceInvalid = true; + invReason.add("END_TOO_CLOSE"); + } else { + boolean maskNearVariantSite = false; + for ( int i = start; i < end; i++ ) { + maskNearVariantSite |= (seq.charAt(i) == 'N' || Character.isLowerCase(seq.charAt(i))); + } + + if ( maskNearVariantSite ) { + logger.warn("There is one (or more) mask variants within 4 basepair of the variant given for validation. This site will not be designed."); + sequenceInvalid = true; + invReason.add("VARIANT_TOO_NEAR_PROBE"); + } + } + + if ( seq.indexOf("[") != seq.lastIndexOf("[") ) { + logger.warn("Multiple probe variants were found within this interval. Please fix the definitions of the intervals so they do not overlap."); + sequenceInvalid = true; + invReason.add("MULTIPLE_PROBES"); + } + + if ( seq.indexOf("[") < 0 ) { + logger.warn("No variants in region were found. This site will not be designed."); + sequenceInvalid = true; + invReason.add("NO_VARIANTS_FOUND"); + } + } + + public void lowerNonUniqueSegments() { + if ( ! 
invReason.contains("MULTIPLE_PROBES") && !invReason.contains("NO_VARIANTS_FOUND") ) { + String leftFlank = rawSequence.toString().split("\\[")[0]; + String rightFlank = rawSequence.substring(rawSequence.indexOf("]")+1); /* everything after the single ']'; split("\\]")[1] threw ArrayIndexOutOfBoundsException when the allele was the last thing in the interval, because split drops a trailing empty string */ + List badLeft = getBadIndeces(leftFlank); + List badRight = getBadIndeces(rightFlank); + // propagate lowercases into the printed sequence + for ( int idx = 0; idx < leftFlank.length(); idx++ ) { + while ( badLeft.size() > 0 && idx > badLeft.get(0) + virtualPrimerSize ) { + badLeft.remove(0); + } + + if ( badLeft.size() > 0 && badLeft.get(0) <= idx && idx <= badLeft.get(0) + virtualPrimerSize ) { + sequence.setCharAt(idx,Character.toLowerCase(sequence.charAt(idx))); + } + } + + int offset = 1 + rawSequence.indexOf("]"); + for ( int i= 0; i < rightFlank.length(); i++ ) { + int idx = i + offset; + while ( badRight.size() > 0 && i > badRight.get(0) + virtualPrimerSize ) { + //logger.debug("Removing "+badRight.get(0)+" because "+(badRight.get(0)+virtualPrimerSize)+" < "+i); + badRight.remove(0); + } + + if ( badRight.size() > 0 && badRight.get(0) <= i && i <= badRight.get(0) + virtualPrimerSize ) { + //logger.debug("Resetting character on right flank: "+idx+" "+i+" offset="+offset); + //logger.debug(sequence); + sequence.setCharAt(idx,Character.toLowerCase(sequence.charAt(idx))); + //logger.debug(sequence); + } + } + } + } + + private List getBadIndeces(String sequence) { + /* returns the start offsets of every virtualPrimerSize-long window of the flank whose best alignment has mapping quality 0 although more than one alignment was reported; a flank shorter than the window yields an empty list */ + List badLeftIndeces = new ArrayList(Math.max(0, sequence.length()-virtualPrimerSize)); /* clamp the capacity hint: a negative value makes the ArrayList constructor throw IllegalArgumentException for short flanks */ + for ( int i = 0; i < sequence.length()-virtualPrimerSize ; i++ ) { + String toAlign = sequence.substring(i,i+virtualPrimerSize); + Iterable allAlignments = aligner.getAllAlignments(toAlign.getBytes()); + for ( Alignment[] alignments : allAlignments ) { + if ( alignments.length > 1 ) { + if ( alignments[0].getMappingQuality() == 0 ) { + // this region is bad -- multiple MQ alignments + badLeftIndeces.add(i); + } + } + } + } + + return badLeftIndeces; + } + + + /** + * Note- this is an old function - a proxy for
identifying regions with low specificity to genome. Saved in case the alignment-based version + * turns out to be worse than just doing a simple repeat-lowering method. + */ + public void lowerRepeats() { + // convert to lower case low-complexity repeats, e.g. tandem k-mers + final int K_LIM = 8; + String seq = sequence.toString(); + StringBuilder newSequence = new StringBuilder(); + int start_pos = 0; + while( start_pos < seq.length() ) { + boolean broke = false; + for ( int length = K_LIM; length > 1; length -- ) { + //logger.debug(String.format("start1: %d end1: %d start2: %d end2: %d str: %d",start_pos,start_pos+length,start_pos+length,start_pos+2*length,seq.length())); + if ( start_pos + 2*length> seq.length() ) { + continue; + } + if ( equalsIgnoreNs(seq.substring(start_pos,start_pos+length),seq.substring(start_pos+length,start_pos+2*length)) ) { + newSequence.append(seq.substring(start_pos,start_pos+length).toLowerCase()); + newSequence.append(seq.substring(start_pos+length,start_pos+2*length).toLowerCase()); + start_pos += 2*length; + broke = true; + break; + } + } + + if ( ! 
broke ) { + newSequence.append(seq.substring(start_pos,start_pos+1)); + start_pos++; + } + + } + + if ( seq.indexOf("[") != seq.lastIndexOf("[") ) { + return; + } + + sequence = newSequence; + } + + public boolean equalsIgnoreNs(String one, String two) { + if ( one.length() != two.length() ) { return false; } + for ( int idx = 0; idx < one.length(); idx++ ) { + if ( Character.toUpperCase(one.charAt(idx)) != Character.toUpperCase(two.charAt(idx)) ) { + if ( Character.toUpperCase(one.charAt(idx)) != 'N' && Character.toUpperCase(two.charAt(idx)) != 'N' ) { + return false; + } + } + } + + //logger.debug(String.format("one: %s two: %s",one,two)); + + return true; + } + + public void print() { + String valid; + if ( sequenceInvalid ) { + valid = ""; + while ( invReason.size() > 0 ) { + String reason = invReason.get(0); + invReason.remove(reason); + int num = 1; + while ( invReason.contains(reason) ) { + num++; + invReason.remove(reason); + } + valid += String.format("%s=%d,",reason,num); + } + } else { + valid = "Valid"; + } + + String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D'); + out.printf(">%s %s %s%n%s%n", allelePos != null ? 
allelePos.toString() : "multiple", valid, probeName, seqIdentity); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index fe3173506..3867aa958 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -67,7 +67,7 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="stratificationModule", shortName="ST", doc="One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noS is specified)", required=false) protected String[] STRATIFICATIONS_TO_USE = {}; - @Argument(fullName="doNotUseAllStandardStratifications", shortName="noST", doc="Do not use the standard stratification modules by default (instead, only those that are specified with the -S option)") + @Argument(fullName="doNotUseAllStandardStratifications", shortName="noST", doc="Do not use the standard stratification modules by default (instead, only those that are specified with the -S option)", required=false) protected Boolean NO_STANDARD_STRATIFICATIONS = false; @Argument(fullName="onlyVariantsOfType", shortName="VT", doc="If provided, only variants of these types will be considered during the evaluation, in ", required=false) @@ -77,7 +77,7 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false) protected String[] MODULES_TO_USE = {}; - @Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -E option)") + 
@Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -E option)", required=false) protected Boolean NO_STANDARD_MODULES = false; // Other arguments diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java index 3e8a6ed17..e1f2ae983 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java @@ -8,6 +8,18 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.ArrayList; import java.util.Set; +/** + * CpG is a stratification module for VariantEval that divides the input data by within/not within a CpG site + * + *

+ * It is a three-state stratification: + *

    + *
  • The locus is a CpG site ("CpG") + *
  • The locus is not a CpG site ("non_CpG") + *
  • The locus is either a CpG or not a CpG site ("all") + *
+ * A CpG site is defined as a site where the reference base at a locus is a C and the adjacent reference base in the 3' direction is a G. + */ public class CpG extends VariantStratifier { private ArrayList states; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 403c67d3e..33504f96e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -49,8 +49,6 @@ import java.util.*; * * @author rpoplin * @since Mar 14, 2011 - * - * @help.summary Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration */ public class ApplyRecalibration extends RodWalker { @@ -206,9 +204,9 @@ public class ApplyRecalibration extends RodWalker { filters.add(filterString); vc = VariantContext.modifyFilters(vc, filters); } - vcfWriter.add( VariantContext.modifyPErrorFiltersAndAttributes(vc, vc.getNegLog10PError(), vc.getFilters(), attrs), ref.getBase() ); + vcfWriter.add( VariantContext.modifyPErrorFiltersAndAttributes(vc, vc.getNegLog10PError(), vc.getFilters(), attrs) ); } else { // valid VC but not compatible with this mode, so just emit the variant untouched - vcfWriter.add( vc, ref.getBase() ); + vcfWriter.add( vc ); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 67d54a408..b7f71c1ff 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ 
-240,6 +240,14 @@ public class VariantDataManager { if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } + if (vc.isIndel() && annotationKey.equalsIgnoreCase("QD")) { + // normalize QD by event length for indel case + int eventLength = Math.abs(vc.getAlternateAllele(0).getBaseString().length() - vc.getReference().getBaseString().length()); // ignore multi-allelic complication here for now + if (eventLength > 0) // sanity check + value /= (double)eventLength; + + } + if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } } catch( Exception e ) { @@ -258,7 +266,7 @@ public class VariantDataManager { datum.consensusCount = 0; for( final TrainingSet trainingSet : trainingSets ) { - for( final VariantContext trainVC : tracker.getVariantContexts( ref, trainingSet.name, null, context.getLocation(), false, false ) ) { + for( final VariantContext trainVC : tracker.getVariantContexts( ref, trainingSet.name, null, context.getLocation(), true, false ) ) { if( trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) && (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphic()) ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 8179463eb..76c888640 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -53,8 +53,6 @@ import java.util.*; * * User: rpoplin * Date: 3/12/11 - * - * @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations and evaluates the variant -- assigning an informative lod score */ public class VariantRecalibrator extends RodWalker, ExpandingArrayList> implements TreeReducible> { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 837f352f8..57e2746f3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import org.apache.poi.hpsf.Variant; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Output; @@ -149,7 +150,7 @@ public class CombineVariants extends RodWalker { // get all of the vcf rods at this locus // Need to provide reference bases to simpleMerge starting at current locus - Collection vcs = tracker.getAllVariantContexts(ref, null,context.getLocation(), true, false); + Collection vcs = tracker.getAllVariantContexts(ref, null, context.getLocation(), true, false); if ( sitesOnlyVCF ) { vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs); @@ -157,7 +158,7 @@ public class CombineVariants extends RodWalker { if ( ASSUME_IDENTICAL_SAMPLES ) { for ( final VariantContext vc : vcs ) { - vcfWriter.add( vc, ref.getBase() ); + vcfWriter.add(vc); } return vcs.isEmpty() ? 
0 : 1; @@ -172,24 +173,32 @@ public class CombineVariants extends RodWalker { if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN)) return 0; - VariantContext mergedVC; + List mergedVCs = new ArrayList(); if ( master ) { - mergedVC = VariantContextUtils.masterMerge(vcs, "master"); + mergedVCs.add(VariantContextUtils.masterMerge(vcs, "master")); } else { - mergedVC = VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(),vcs, priority, filteredRecordsMergeType, - genotypeMergeOption, true, printComplexMerges, ref.getBase(), SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC); + Map> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs); + // iterate over the types so that it's deterministic + for ( VariantContext.Type type : VariantContext.Type.values() ) { + if ( VCsByType.containsKey(type) ) + mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), + priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + } } - //out.printf(" merged => %s%nannotated => %s%n", mergedVC, annotatedMergedVC); + for ( VariantContext mergedVC : mergedVCs ) { + // only operate at the start of events + if ( mergedVC == null ) + continue; - if ( mergedVC != null ) { // only operate at the start of events HashMap attributes = new HashMap(mergedVC.getAttributes()); // re-compute chromosome counts VariantContextUtils.calculateChromosomeCounts(mergedVC, attributes, false); VariantContext annotatedMergedVC = VariantContext.modifyAttributes(mergedVC, attributes); if ( minimalVCF ) annotatedMergedVC = VariantContextUtils.pruneVariantContext(annotatedMergedVC, Arrays.asList(SET_KEY)); - vcfWriter.add(annotatedMergedVC, ref.getBase()); + vcfWriter.add(annotatedMergedVC); } return vcs.isEmpty() ? 
0 : 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java index b45ee1b67..fc9947e20 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java @@ -78,7 +78,7 @@ public class FilterLiftedVariants extends RodWalker { if ( failed ) failedLocs++; else - writer.add(vc, ref[0]); + writer.add(vc); } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java index 2ebd183f4..5ab326418 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java @@ -90,10 +90,10 @@ public class LeftAlignVariants extends RodWalker { private int alignAndWrite(VariantContext vc, final ReferenceContext ref) { - if ( vc.isBiallelic() && vc.isIndel() ) + if ( vc.isBiallelic() && vc.isIndel() && !vc.isComplexIndel() ) return writeLeftAlignedIndel(vc, ref); else { - writer.add(vc, ref.getBase()); + writer.add(vc); return 0; } } @@ -109,7 +109,7 @@ public class LeftAlignVariants extends RodWalker { indelLength = vc.getAlternateAllele(0).length(); if ( indelLength > 200 ) { - writer.add(vc, ref.getBase()); + writer.add(vc); return 0; } @@ -137,17 +137,12 @@ public class LeftAlignVariants extends RodWalker { byte[] newBases = new byte[indelLength]; System.arraycopy((vc.isDeletion() ? 
refSeq : originalIndel), indelIndex, newBases, 0, indelLength); Allele newAllele = Allele.create(newBases, vc.isDeletion()); - newVC = updateAllele(newVC, newAllele); + newVC = updateAllele(newVC, newAllele, refSeq[indelIndex-1]); - // we need to update the reference base just in case it changed - Map attrs = new HashMap(newVC.getAttributes()); - attrs.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, refSeq[indelIndex-1]); - newVC = VariantContext.modifyAttributes(newVC, attrs); - - writer.add(newVC, refSeq[indelIndex-1]); + writer.add(newVC); return 1; } else { - writer.add(vc, ref.getBase()); + writer.add(vc); return 0; } } @@ -173,7 +168,7 @@ public class LeftAlignVariants extends RodWalker { return hap; } - public static VariantContext updateAllele(VariantContext vc, Allele newAllele) { + public static VariantContext updateAllele(VariantContext vc, Allele newAllele, Byte refBaseForIndel) { // create a mapping from original allele to new allele HashMap alleleMap = new HashMap(vc.getAlleles().size()); if ( newAllele.isReference() ) { @@ -197,6 +192,6 @@ public class LeftAlignVariants extends RodWalker { newGenotypes.put(genotype.getKey(), Genotype.modifyAlleles(genotype.getValue(), newAlleles)); } - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), alleleMap.values(), newGenotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes()); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), alleleMap.values(), newGenotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? 
vc.getFilters() : null, vc.getAttributes(), refBaseForIndel); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index 4f05c8aac..b33f4d26a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -125,14 +125,14 @@ public class LiftoverVariants extends RodWalker { vc = VariantContext.modifyAttributes(vc, attrs); } - VariantContext newVC = VariantContext.createVariantContextWithPaddedAlleles(vc, ref.getBase(), false); + VariantContext newVC = VariantContext.createVariantContextWithPaddedAlleles(vc, false); if ( originalVC.isSNP() && originalVC.isBiallelic() && VariantContextUtils.getSNPSubstitutionType(originalVC) != VariantContextUtils.getSNPSubstitutionType(newVC) ) { logger.warn(String.format("VCF at %s / %d => %s / %d is switching substitution type %s/%s to %s/%s", originalVC.getChr(), originalVC.getStart(), newVC.getChr(), newVC.getStart(), originalVC.getReference(), originalVC.getAlternateAllele(0), newVC.getReference(), newVC.getAlternateAllele(0))); } - writer.add(vc, ref.getBase()); + writer.add(vc); successfulIntervals++; } else { failedIntervals++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java index f0756d884..257bda372 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java @@ -101,9 +101,9 @@ public class RandomlySplitVariants extends RodWalker { for ( VariantContext vc : vcs ) { int random = GenomeAnalysisEngine.getRandomGenerator().nextInt(1000); if ( random < 
iFraction ) - vcfWriter1.add(vc, ref.getBase()); + vcfWriter1.add(vc); else - vcfWriter2.add(vc, ref.getBase()); + vcfWriter2.add(vc); } return 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 1db692e9f..41374a349 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -24,26 +24,30 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.MendelianViolation; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RMD; import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.variantcontext.Allele; import 
org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; import java.util.*; /** @@ -91,6 +95,13 @@ public class SelectVariants extends RodWalker { @Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false) private boolean KEEP_AF_SPECTRUM = false; + @Hidden + @Argument(fullName="afFile", shortName="afFile", doc="The output recal file used by ApplyRecalibration", required=false) + private File AF_FILE = new File(""); + + @Hidden + @Argument(fullName="family_structure_file", shortName="familyFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) + private File FAMILY_STRUCTURE_FILE = null; @Argument(fullName="family_structure", shortName="family", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) private String FAMILY_STRUCTURE = ""; @@ -113,20 +124,20 @@ public class SelectVariants extends RodWalker { @Argument(fullName="selectIndels", shortName="indels", doc="Select only Indels.", required=false) private boolean SELECT_INDELS = false; + @Hidden + @Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! 
string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) + private String outMVFile = null; /* Private class used to store the intermediate variants in the integer random selection process */ private class RandomVariantStructure { private VariantContext vc; - private byte refBase; - RandomVariantStructure(VariantContext vcP, byte refBaseP) { + RandomVariantStructure(VariantContext vcP) { vc = vcP; - refBase = refBaseP; } - public void set (VariantContext vcP, byte refBaseP) { + public void set (VariantContext vcP) { vc = vcP; - refBase = refBaseP; } } @@ -140,7 +151,7 @@ public class SelectVariants extends RodWalker { private boolean DISCORDANCE_ONLY = false; private boolean CONCORDANCE_ONLY = false; - private MendelianViolation mv; + private Set mvSet = new HashSet(); /* default name for the variant dataset (VCF) */ private final String variantRodName = "variant"; @@ -155,8 +166,14 @@ public class SelectVariants extends RodWalker { private RandomVariantStructure [] variantArray; + /* Variables used for random selection with AF boosting */ + private ArrayList afBreakpoints = null; + private ArrayList afBoosts = null; + double bkDelta = 0.0; + private PrintStream outMVFileStream = null; + /** * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher @@ -212,10 +229,29 @@ public class SelectVariants extends RodWalker { CONCORDANCE_ONLY = concordanceRodName.length() > 0; if (CONCORDANCE_ONLY) logger.info("Selecting only variants concordant with the track: " + concordanceRodName); - if (MENDELIAN_VIOLATIONS) - mv = new MendelianViolation(getToolkit(), MENDELIAN_VIOLATION_QUAL_THRESHOLD); + if (MENDELIAN_VIOLATIONS) { + if ( FAMILY_STRUCTURE_FILE != null) { + try { + for ( final String line : new XReadLines( FAMILY_STRUCTURE_FILE ) ) { + MendelianViolation mv = new MendelianViolation(line, MENDELIAN_VIOLATION_QUAL_THRESHOLD); + if (samples.contains(mv.getSampleChild()) && 
samples.contains(mv.getSampleDad()) && samples.contains(mv.getSampleMom())) + mvSet.add(mv); + } + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(FAMILY_STRUCTURE_FILE, e); // report the family-structure file that actually failed to open, not the AF table + } + if (outMVFile != null) + try { + outMVFileStream = new PrintStream(outMVFile); + } + catch (FileNotFoundException e) { + throw new UserException.CouldNotCreateOutputFile(outMVFile, "Can't open output file", e); } + } + else + mvSet.add(new MendelianViolation(getToolkit(), MENDELIAN_VIOLATION_QUAL_THRESHOLD)); + } else if (!FAMILY_STRUCTURE.isEmpty()) { - mv = new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD); + mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD)); MENDELIAN_VIOLATIONS = true; } @@ -226,7 +262,34 @@ public class SelectVariants extends RodWalker { } SELECT_RANDOM_FRACTION = fractionRandom > 0; - if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + fractionRandom + "% of the variants at random from the variant track"); + if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track"); + + + if (KEEP_AF_SPECTRUM) { + try { + afBreakpoints = new ArrayList(); + afBoosts = new ArrayList(); + logger.info("Reading in AF boost table..."); + boolean firstLine = false; + for ( final String line : new XReadLines( AF_FILE ) ) { + if (!firstLine) { + firstLine = true; + continue; + } + final String[] vals = line.split(" "); + + double bkp = Double.valueOf(vals[0]); + double afb = Double.valueOf(vals[1]); + afBreakpoints.add(bkp); + afBoosts.add(afb); + + } + if (!afBreakpoints.isEmpty()) bkDelta = afBreakpoints.get(0); // header-only table: keep bkDelta at 0.0 instead of throwing IndexOutOfBoundsException + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(AF_FILE, e); + } + + } } /** @@ -250,9 +313,24 @@ public class SelectVariants extends RodWalker { for (VariantContext vc : vcs) { if (MENDELIAN_VIOLATIONS) { - if (!mv.isViolation(vc)) { - break; + boolean foundMV = false; + for
(MendelianViolation mv : mvSet) { + if (mv.isViolation(vc)) { + foundMV = true; + //System.out.println(vc.toString()); + if (outMVFileStream != null) // guard the stream itself: it is only opened when a family-structure file was supplied + outMVFileStream.format("MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, " + + "childG=%s childGL=%s\n",vc.getChr(), vc.getStart(), + vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString(), vc.getChromosomeCount(vc.getAlternateAllele(0)), + mv.getSampleMom(), mv.getSampleDad(), mv.getSampleChild(), + vc.getGenotype(mv.getSampleMom()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(), + vc.getGenotype(mv.getSampleDad()).toBriefString(), vc.getGenotype(mv.getSampleDad()).getLikelihoods().getAsString(), // dadGL must come from the dad's genotype, not the mom's + vc.getGenotype(mv.getSampleChild()).toBriefString(),vc.getGenotype(mv.getSampleChild()).getLikelihoods().getAsString() ); + } } + + if (!foundMV) + break; } if (DISCORDANCE_ONLY) { Collection compVCs = tracker.getVariantContexts(ref, discordanceRodName, null, context.getLocation(), true, false); @@ -283,46 +361,59 @@ public class SelectVariants extends RodWalker { if (SELECT_RANDOM_NUMBER) { randomlyAddVariant(++variantNumber, sub, ref.getBase()); } - else if (!SELECT_RANDOM_FRACTION || GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom) { - vcfWriter.add(sub, ref.getBase()); + else if (!SELECT_RANDOM_FRACTION || (!KEEP_AF_SPECTRUM && GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { + vcfWriter.add(sub); } else { if (SELECT_RANDOM_FRACTION && KEEP_AF_SPECTRUM ) { - Collection compVCs = tracker.getVariantContexts(ref, inputAFRodName, null, context.getLocation(), true, false); - if (compVCs.isEmpty()) - return 0; - // ok we have a comp VC and we need to match the AF spectrum of inputAFRodName.
// We then pick a variant with probablity AF*desiredFraction - for (VariantContext compVC : compVCs) { - if ( compVC.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) { - String afo = compVC.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY); + if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) { + String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY); - double af; - if (afo.contains(",")) { - String[] afs = afo.split(","); - afs[0] = afs[0].substring(1,afs[0].length()); - afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1); + double af; + double afBoost = 1.0; + if (afo.contains(",")) { + String[] afs = afo.split(","); + afs[0] = afs[0].substring(1,afs[0].length()); + afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1); - double[] afd = new double[afs.length]; + double[] afd = new double[afs.length]; - for (int k=0; k < afd.length; k++) - afd[k] = Double.valueOf(afs[k]); + for (int k=0; k < afd.length; k++) + afd[k] = Double.valueOf(afs[k]); - af = MathUtils.arrayMax(afd); - //af = Double.valueOf(afs[0]); + af = MathUtils.arrayMax(afd); + //af = Double.valueOf(afs[0]); - } - else - af = Double.valueOf(afo); - - //System.out.format("%s .. %4.4f\n",afo.toString(), af); - if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom * af) - vcfWriter.add(sub, ref.getBase()); } - break; // do only one vc + else + af = Double.valueOf(afo); + + // now boost af by table read from file if desired + //double bkpt = 0.0; + int bkidx = 0; + if (!afBreakpoints.isEmpty()) { + for ( Double bkpt : afBreakpoints) { + if (af < bkpt + bkDelta) + break; + else bkidx++; + } + if (bkidx >=afBoosts.size()) + bkidx = afBoosts.size()-1; + afBoost = afBoosts.get(bkidx); + //System.out.formatPrin("af:%f bkidx:%d afboost:%f\n",af,bkidx,afBoost); + + + + } + + //System.out.format("%s .. 
%4.4f\n",afo.toString(), af); + if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom * afBoost * afBoost) + vcfWriter.add(sub); } + } } } @@ -406,8 +497,8 @@ public class SelectVariants extends RodWalker { private boolean haveSameGenotypes(Genotype g1, Genotype g2) { if ((g1.isCalled() && g2.isFiltered()) || - (g2.isCalled() && g1.isFiltered()) || - (g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED)) + (g2.isCalled() && g1.isFiltered()) || + (g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED)) return false; List a1s = g1.getAlleles(); @@ -426,7 +517,7 @@ public class SelectVariants extends RodWalker { if (SELECT_RANDOM_NUMBER) { int positionToPrint = positionToAdd; for (int i=0; i { * @param vc the VariantContext record to subset * @param samples the samples to extract * @return the subsetted VariantContext - */ + */ private VariantContext subsetRecord(VariantContext vc, Set samples) { if ( samples == null || samples.isEmpty() ) return vc; @@ -450,7 +541,7 @@ public class SelectVariants extends RodWalker { if ( samples.contains(genotypePair.getKey()) ) genotypes.add(genotypePair.getValue()); } - + VariantContext sub = vc.subContextFromGenotypes(genotypes, vc.getAlleles()); HashMap attributes = new HashMap(sub.getAttributes()); @@ -460,7 +551,7 @@ public class SelectVariants extends RodWalker { Genotype g = sub.getGenotype(sample); if (g.isNotFiltered() && g.isCalled()) { - + String dp = (String) g.getAttribute("DP"); if (dp != null && ! dp.equals(VCFConstants.MISSING_DEPTH_v3) && ! 
dp.equals(VCFConstants.MISSING_VALUE_v4) ) { depth += Integer.valueOf(dp); @@ -489,13 +580,13 @@ public class SelectVariants extends RodWalker { private void randomlyAddVariant(int rank, VariantContext vc, byte refBase) { if (nVariantsAdded < numRandom) - variantArray[nVariantsAdded++] = new RandomVariantStructure(vc, refBase); + variantArray[nVariantsAdded++] = new RandomVariantStructure(vc); else { double v = GenomeAnalysisEngine.getRandomGenerator().nextDouble(); double t = (1.0/(rank-numRandom+1)); if ( v < t) { - variantArray[positionToAdd].set(vc, refBase); + variantArray[positionToAdd].set(vc); nVariantsAdded++; positionToAdd = nextCircularPosition(positionToAdd); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java index 0644c669b..0de405d97 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java @@ -154,10 +154,10 @@ public class ValidateVariants extends RodWalker { try { switch( type ) { case ALL: - vc.extraStrictValidation(observedRefAllele, rsIDs); + vc.extraStrictValidation(observedRefAllele, ref.getBase(), rsIDs); break; case REF: - vc.validateReferenceBases(observedRefAllele); + vc.validateReferenceBases(observedRefAllele, ref.getBase()); break; case IDS: vc.validateRSIDs(rsIDs); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java index 86bb3b0e8..ca6533721 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java @@ -46,7 +46,7 @@ import java.util.*; */ 
@Reference(window=@Window(start=0,stop=40)) @Requires(value={},referenceMetaData=@RMD(name=VariantValidationAssessor.INPUT_VARIANT_ROD_BINDING_NAME, type=VariantContext.class)) -public class VariantValidationAssessor extends RodWalker,Integer> { +public class VariantValidationAssessor extends RodWalker { public static final String INPUT_VARIANT_ROD_BINDING_NAME = "variant"; @@ -68,7 +68,7 @@ public class VariantValidationAssessor extends RodWalker sampleNames = null; // variant context records - private ArrayList> records = new ArrayList>(); + private ArrayList records = new ArrayList(); // statistics private int numRecords = 0; @@ -89,7 +89,7 @@ public class VariantValidationAssessor extends RodWalker map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) return null; @@ -104,7 +104,7 @@ public class VariantValidationAssessor extends RodWalker call, Integer numVariants) { + public Integer reduce(VariantContext call, Integer numVariants) { if ( call != null ) { numVariants++; records.add(call); @@ -155,12 +155,12 @@ public class VariantValidationAssessor extends RodWalker record : records ) - vcfwriter.add(record.first, record.second); + for ( VariantContext record : records ) + vcfwriter.add(record); } - private Pair addVariantInformationToCall(ReferenceContext ref, VariantContext vContext) { + private VariantContext addVariantInformationToCall(ReferenceContext ref, VariantContext vContext) { // check possible filters double hwPvalue = hardyWeinbergCalculation(vContext); @@ -202,9 +202,7 @@ public class VariantValidationAssessor extends RodWalker(vContext, ref.getBase()); + return VariantContext.modifyAttributes(vContext, infoMap); } private double hardyWeinbergCalculation(VariantContext vc) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 8d90af65a..b2b6d4815 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -24,6 +24,8 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -33,7 +35,6 @@ import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.PrintStream; @@ -74,17 +75,29 @@ public class VariantsToTable extends RodWalker { // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } }); getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } }); - getters.put("REF", new Getter() { public String get(VariantContext vc) { return vc.getReference().toString(); } }); + getters.put("REF", new Getter() { + public String get(VariantContext vc) { + String x = ""; + if ( vc.hasReferenceBaseForIndel() ) { + Byte refByte = vc.getReferenceBaseForIndel(); + x=x+new String(new byte[]{refByte}); + } + return x+vc.getReference().getDisplayString(); + } + }); getters.put("ALT", new Getter() { public String get(VariantContext vc) { StringBuilder x = new StringBuilder(); int n = vc.getAlternateAlleles().size(); - if ( n == 0 ) 
return "."; + if ( vc.hasReferenceBaseForIndel() ) { + Byte refByte = vc.getReferenceBaseForIndel(); + x.append(new String(new byte[]{refByte})); + } for ( int i = 0; i < n; i++ ) { if ( i != 0 ) x.append(","); - x.append(vc.getAlternateAllele(i).toString()); + x.append(vc.getAlternateAllele(i).getDisplayString()); } return x.toString(); } @@ -168,6 +181,31 @@ public class VariantsToTable extends RodWalker { throw new UserException(String.format("Missing field %s in vc %s at %s", field, vc.getSource(), vc)); } + if (field.equals("AF") || field.equals("AC")) { + String afo = val; + + double af=0; + if (afo.contains(",")) { + String[] afs = afo.split(","); + afs[0] = afs[0].substring(1,afs[0].length()); + afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1); + + double[] afd = new double[afs.length]; + + for (int k=0; k < afd.length; k++) + afd[k] = Double.valueOf(afs[k]); + + af = MathUtils.arrayMax(afd); + //af = Double.valueOf(afs[0]); + + } + else + if (!afo.equals("NA")) + af = Double.valueOf(afo); + + val = Double.toString(af); + + } vals.add(val); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index aa0e5987f..2afa315ff 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -67,6 +67,9 @@ public class VariantsToVCF extends RodWalker { @Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod (for data like GELI with genotypes)", required=false) protected String sampleName = null; + @Argument(fullName="fixRef", shortName="fixRef", doc="Fix common reference base in case there's an indel without padding", required=false) + protected boolean fixReferenceBase = false; + private Set allowedGenotypeFormatStrings = new 
HashSet(); private boolean wroteHeader = false; @@ -104,6 +107,10 @@ public class VariantsToVCF extends RodWalker { vc = VariantContext.modifyGenotypes(vc, genotypes); } + // todo - fix me. This may not be the cleanest way to handle features what need correct indel padding + if (fixReferenceBase) { + vc = new VariantContext("Variant",vc.getChr(),vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.getGenotypes(), vc.getNegLog10PError(), vc.getFilters(),vc.getAttributes(), ref.getBase()); + } writeRecord(vc, tracker, ref.getBase()); } @@ -149,9 +156,10 @@ public class VariantsToVCF extends RodWalker { VariantContext vc = VariantContextAdaptors.toVariantContext(INPUT_ROD_NAME, hapmap, ref); if ( vc != null ) { if ( refBase != null ) { - Map attrs = new HashMap(vc.getAttributes()); - attrs.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, refBase); - vc = VariantContext.modifyAttributes(vc, attrs); + // TODO -- fix me + //Map attrs = new HashMap(vc.getAttributes()); + //attrs.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, refBase); + //vc = VariantContext.modifyAttributes(vc, attrs); } hapmapVCs.add(vc); } @@ -233,7 +241,7 @@ public class VariantsToVCF extends RodWalker { } vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings); - vcfwriter.add(vc, ref); + vcfwriter.add(vc); } public Integer reduceInit() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcid.java b/public/java/src/org/broadinstitute/sting/utils/AminoAcid.java similarity index 97% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcid.java rename to public/java/src/org/broadinstitute/sting/utils/AminoAcid.java index 0d0b906e0..0b47093fa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcid.java +++ b/public/java/src/org/broadinstitute/sting/utils/AminoAcid.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; +package org.broadinstitute.sting.utils; /** * Represents a single amino acid. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcidTable.java b/public/java/src/org/broadinstitute/sting/utils/AminoAcidTable.java similarity index 99% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcidTable.java rename to public/java/src/org/broadinstitute/sting/utils/AminoAcidTable.java index c10eb5dd7..1ae28ffb3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/AminoAcidTable.java +++ b/public/java/src/org/broadinstitute/sting/utils/AminoAcidTable.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; +package org.broadinstitute.sting.utils; import java.util.HashMap; diff --git a/public/java/src/org/broadinstitute/sting/utils/ContigComparator.java b/public/java/src/org/broadinstitute/sting/utils/ContigComparator.java new file mode 100644 index 000000000..619beddb8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/ContigComparator.java @@ -0,0 +1,71 @@ +package org.broadinstitute.sting.utils; + +import java.util.Comparator; +import java.util.Set; +import java.util.TreeSet; + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 7/23/11 + * Time: 6:07 PM + * + * Contig comparator -- sorting contigs like Picard + * + * This is very useful if you want to output your text files or manipulate data in the usual chromosome ordering : + * 1 + * 2 + * 3 + * ... + * 21 + * 22 + * X + * Y + * GL*** + * ... + * Just use this comparator in any SortedSet class constructor and your data will be sorted like in the BAM file. 
+ */ +public class ContigComparator implements Comparator<String> { + private Set<String> specialChrs; + + public ContigComparator() { + specialChrs = new TreeSet<String>(); + specialChrs.add("X"); + specialChrs.add("Y"); + } + + public int compare(String chr1, String chr2) { + if (chr1.equals(chr2)) + return 0; + + Integer x = convertStringWithoutException(chr1); + Integer y = convertStringWithoutException(chr2); + // both contigs are numbered; equal values spelled differently (e.g. "1" vs "01") fall back to string order so compare(a,b) == -compare(b,a) + if (x != null && y != null) + return x.equals(y) ? chr1.compareTo(chr2) : ((x < y) ? -1 : 1); + + // both contigs are named + if (x == null && y == null) { + // both contigs are special contigs or neither contig is a special contigs + if (specialChrs.contains(chr1) && specialChrs.contains(chr2) || (!specialChrs.contains(chr1) && !specialChrs.contains(chr2))) + return chr1.compareTo(chr2); + // one contig is a special and the other is not special + if (specialChrs.contains(chr1)) + return -1; + return 1; + } + + // one contig is named the other is numbered + if (x != null) + return -1; + return 1; + } + + private Integer convertStringWithoutException(String contig) { + Integer x = null; + try { + x = Integer.valueOf(contig); // decimal only: decode() reads a leading 0 as octal, so "08" failed to parse and "010" became 8 + } catch (NumberFormatException n){} + return x; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index 6a50badce..015e5d6f6 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -42,6 +42,21 @@ public class Utils { /** our log, which we want to capture anything from this class */ private static Logger logger = Logger.getLogger(Utils.class); + public static final float JAVA_DEFAULT_HASH_LOAD_FACTOR = 0.75f; + + /** + * Calculates the optimum initial size for a hash table given the maximum number + * of elements it will need to hold. The optimum size is the smallest size that + * is guaranteed not to result in any rehash/table-resize operations.
+ * + * @param maxElements The maximum number of elements you expect the hash table + * will need to hold + * @return The optimum initial size for the table, given maxElements + */ + public static int optimumHashSize ( int maxElements ) { + return (int)(maxElements / JAVA_DEFAULT_HASH_LOAD_FACTOR) + 2; + } + public static String getClassName(Class c) { String FQClassName = c.getName(); int firstChar; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 710127f7a..9788f8654 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -567,7 +567,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, // set the reference base for indels in the attributes Map attributes = new TreeMap(inputVC.getAttributes()); - attributes.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, new Byte(inputVC.getReference().getBases()[0])); Map originalToTrimmedAlleleMap = new HashMap(); @@ -611,7 +610,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, genotypes.put(sample.getKey(), Genotype.modifyAlleles(sample.getValue(), trimmedAlleles)); } - return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? inputVC.getFilters() : null, attributes); + return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? 
inputVC.getFilters() : null, attributes, new Byte(inputVC.getReference().getBases()[0])); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java index 311aaecf7..c299511db 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/SortingVCFWriterBase.java @@ -105,9 +105,8 @@ public abstract class SortingVCFWriterBase implements VCFWriter { * add a record to the file * * @param vc the Variant Context object - * @param refBase the ref base */ - public void add(VariantContext vc, byte refBase) { + public void add(VariantContext vc) { /* Note that the code below does not prevent the successive add()-ing of: (chr1, 10), (chr20, 200), (chr15, 100) since there is no implicit ordering of chromosomes: */ @@ -122,7 +121,7 @@ public abstract class SortingVCFWriterBase implements VCFWriter { noteCurrentRecord(vc); // possibly overwritten - queue.add(new VCFRecord(vc, refBase)); + queue.add(new VCFRecord(vc)); emitSafeRecords(); } @@ -133,7 +132,7 @@ public abstract class SortingVCFWriterBase implements VCFWriter { // No need to wait, waiting for nothing, or before what we're waiting for: if (emitUnsafe || mostUpstreamWritableLoc == null || firstRec.vc.getStart() <= mostUpstreamWritableLoc) { queue.poll(); - innerWriter.add(firstRec.vc, firstRec.refBase); + innerWriter.add(firstRec.vc); } else { break; @@ -143,7 +142,7 @@ public abstract class SortingVCFWriterBase implements VCFWriter { /** * Gets a string representation of this object. 
- * @return + * @return a string representation of this object */ @Override public String toString() { @@ -158,11 +157,9 @@ public abstract class SortingVCFWriterBase implements VCFWriter { private static class VCFRecord { public VariantContext vc; - public byte refBase; - public VCFRecord(VariantContext vc, byte refBase) { + public VCFRecord(VariantContext vc) { this.vc = vc; - this.refBase = refBase; } } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index b7f4be39a..d3705813c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -202,20 +202,18 @@ public class StandardVCFWriter implements VCFWriter { * add a record to the file * * @param vc the Variant Context object - * @param refBase the ref base used for indels */ - public void add(VariantContext vc, byte refBase) { - add(vc, refBase, false); + public void add(VariantContext vc) { + add(vc, false); } /** * add a record to the file * * @param vc the Variant Context object - * @param refBase the ref base used for indels * @param refBaseShouldBeAppliedToEndOfAlleles *** THIS SHOULD BE FALSE EXCEPT FOR AN INDEL AT THE EXTREME BEGINNING OF A CONTIG (WHERE THERE IS NO PREVIOUS BASE, SO WE USE THE BASE AFTER THE EVENT INSTEAD) */ - public void add(VariantContext vc, byte refBase, boolean refBaseShouldBeAppliedToEndOfAlleles) { + public void add(VariantContext vc, boolean refBaseShouldBeAppliedToEndOfAlleles) { if ( mHeader == null ) throw new IllegalStateException("The VCF Header must be written before records can be added: " + locationString()); @@ -223,7 +221,7 @@ public class StandardVCFWriter implements VCFWriter { vc = VariantContext.modifyGenotypes(vc, null); try { - vc = 
VariantContext.createVariantContextWithPaddedAlleles(vc, refBase, refBaseShouldBeAppliedToEndOfAlleles); + vc = VariantContext.createVariantContextWithPaddedAlleles(vc, refBaseShouldBeAppliedToEndOfAlleles); // if we are doing on the fly indexing, add the record ***before*** we write any bytes if ( indexer != null ) indexer.addFeature(vc, positionalStream.getPosition()); @@ -285,7 +283,7 @@ public class StandardVCFWriter implements VCFWriter { Map infoFields = new TreeMap(); for ( Map.Entry field : vc.getAttributes().entrySet() ) { String key = field.getKey(); - if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) ) + if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) ) continue; String outputValue = formatVCFField(field.getValue()); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFWriter.java index 0d23fe455..55749d26e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFWriter.java @@ -14,5 +14,5 @@ public interface VCFWriter { */ public void close(); - public void add(VariantContext vc, byte refBase); + public void add(VariantContext vc); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 7eab6f6c9..3c3299ff5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -29,6 +29,7 @@ import net.sf.samtools.SAMRecord; import 
net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; @@ -43,6 +44,9 @@ import java.util.Arrays; * Date: Sep 3, 2010 * Time: 2:24:09 PM */ +@DocumentedGATKFeature( + groupName = "User exceptions", + summary = "Exceptions caused by incorrect user behavior, such as bad files, bad arguments, etc." ) public class UserException extends ReviewedStingException { public UserException(String msg) { super(msg); } public UserException(String msg, Throwable e) { super(msg, e); } diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DescriptionTaglet.java b/public/java/src/org/broadinstitute/sting/utils/help/DescriptionTaglet.java deleted file mode 100644 index 65c332048..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/DescriptionTaglet.java +++ /dev/null @@ -1,59 +0,0 @@ -package org.broadinstitute.sting.utils.help; - -import com.sun.tools.doclets.Taglet; - -import java.util.Map; - -/** - * Provide an alternate description for the given help system. - * - * @author mhanna - * @version 0.1 - */ -public class DescriptionTaglet extends HelpTaglet { - /** - * The key tag for this taglet. - */ - public static final String NAME = "help.description"; - - /** - * Return the name of this custom tag. - */ - @Override - public String getName() { - return NAME; - } - - /** - * Will return false since overviews are always named - * by the @WalkerName tag. - * @return false always - */ - @Override - public boolean inOverview() { - return true; - } - - /** - * Will return true to indicate that packages can be given useful - * description. - * @return true always - */ - @Override - public boolean inPackage() { - return true; - } - - /** - * Register this Taglet. - * @param tagletMap the map to register this tag to. 
- */ - public static void register(Map tagletMap) { - DescriptionTaglet tag = new DescriptionTaglet(); - Taglet t = (Taglet)tagletMap.get(tag.getName()); - if (t != null) { - tagletMap.remove(tag.getName()); - } - tagletMap.put(tag.getName(), tag); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DisplayNameTaglet.java b/public/java/src/org/broadinstitute/sting/utils/help/DisplayNameTaglet.java deleted file mode 100644 index 6c6dad736..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/DisplayNameTaglet.java +++ /dev/null @@ -1,49 +0,0 @@ -package org.broadinstitute.sting.utils.help; - -import com.sun.tools.doclets.Taglet; - -import java.util.Map; - -/** - * Provide a display name in the help for packages - * - * @author mhanna - * @version 0.1 - */ -public class DisplayNameTaglet extends HelpTaglet { - /** - * The display name for this taglet. - */ - public static final String NAME = "help.display.name"; - - /** - * Return the name of this custom tag. - */ - @Override - public String getName() { - return NAME; - } - - /** - * Will return true to indicate that packages can be given useful - * display text. - * @return true always - */ - @Override - public boolean inPackage() { - return true; - } - - /** - * Register this Taglet. - * @param tagletMap the map to register this tag to. 
- */ - public static void register(Map tagletMap) { - DisplayNameTaglet tag = new DisplayNameTaglet(); - Taglet t = (Taglet)tagletMap.get(tag.getName()); - if (t != null) { - tagletMap.remove(tag.getName()); - } - tagletMap.put(tag.getName(), tag); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java new file mode 100644 index 000000000..710503ca8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeature.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.help; + +import java.lang.annotation.*; + +/** + * An annotation to identify a class as a GATK capability for documentation + * + * @author depristo + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface DocumentedGATKFeature { + public boolean enable() default true; + public String groupName(); + public String summary() default ""; + public Class handler() default GenericDocumentationHandler.class; + public Class[] extraDocs() default {}; +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java new file mode 100644 index 000000000..366df0c3a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/help/DocumentedGATKFeatureHandler.java @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.help; + +import com.sun.javadoc.ClassDoc; +import com.sun.javadoc.RootDoc; + +import java.io.*; +import java.util.Set; + +/** + * + */ +public abstract class DocumentedGATKFeatureHandler { + private GATKDoclet doclet; + + protected RootDoc getRootDoc() { + return this.doclet.rootDoc; + } + + public void setDoclet(GATKDoclet doclet) { + this.doclet = doclet; + } + + public GATKDoclet getDoclet() { + return doclet; + } + + public boolean shouldBeProcessed(ClassDoc doc) { return true; } + + public String getDestinationFilename(ClassDoc doc) { + return HelpUtils.getClassName(doc).replace(".", "_") + ".html"; + } + + public abstract String getTemplateName(ClassDoc doc) throws IOException; + public abstract void processOne(RootDoc rootDoc, GATKDocWorkUnit toProcess, Set all); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java new file mode 100644 index 000000000..65c6624d5 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocWorkUnit.java @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission 
notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.help; + +import com.sun.javadoc.ClassDoc; + +import java.util.HashMap; +import java.util.Map; + +/** +* Created by IntelliJ IDEA. +* User: depristo +* Date: 7/24/11 +* Time: 7:59 PM +* To change this template use File | Settings | File Templates. +*/ +public class GATKDocWorkUnit implements Comparable { + // known at the start + final String name, filename, group; + final DocumentedGATKFeatureHandler handler; + final ClassDoc classDoc; + final Class clazz; + final DocumentedGATKFeature annotation; + final String buildTimestamp, absoluteVersion; + + // set by the handler + String summary; + Map forTemplate; + + public GATKDocWorkUnit(String name, String filename, String group, + DocumentedGATKFeature annotation, DocumentedGATKFeatureHandler handler, + ClassDoc classDoc, Class clazz, + String buildTimestamp, String absoluteVersion) { + this.annotation = annotation; + this.name = name; + this.filename = filename; + this.group = group; + this.handler = handler; + this.classDoc = classDoc; + this.clazz = clazz; + this.buildTimestamp = buildTimestamp; + this.absoluteVersion = absoluteVersion; + } + + public void setHandlerContent(String summary, Map forTemplate) { + this.summary = summary; + this.forTemplate = forTemplate; + } + + public Map toMap() { + Map data = new HashMap(); + data.put("name", name); + data.put("summary", summary); + data.put("filename", 
filename); + data.put("group", group); + return data; + } + + public int compareTo(GATKDocWorkUnit other) { + return this.name.compareTo(other.name); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java new file mode 100644 index 000000000..49214237a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.help; + +import com.sun.javadoc.ClassDoc; +import com.sun.javadoc.RootDoc; +import freemarker.template.Configuration; +import freemarker.template.DefaultObjectWrapper; +import freemarker.template.Template; +import freemarker.template.TemplateException; +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.*; +import java.util.*; + +/** + * + */ +public class GATKDoclet { + final protected static File SETTINGS_DIR = new File("settings/helpTemplates"); + final protected static File DESTINATION_DIR = new File("gatkdocs"); + final protected static Logger logger = Logger.getLogger(GATKDoclet.class); + protected static String buildTimestamp = null, absoluteVersion = null; + protected static boolean showHiddenFeatures = false; + + RootDoc rootDoc; + + /** + * Extracts the contents of certain types of javadoc and adds them to an XML file. + * @param rootDoc The documentation root. + * @return Whether the JavaDoc run succeeded. + * @throws java.io.IOException if output can't be written. + */ + public static boolean start(RootDoc rootDoc) throws IOException { + logger.setLevel(Level.DEBUG); + // load arguments + for(String[] options: rootDoc.options()) { + if(options[0].equals("-build-timestamp")) + buildTimestamp = options[1]; + if (options[0].equals("-absolute-version")) + absoluteVersion = options[1]; + if (options[0].equals("-include-hidden")) + showHiddenFeatures = true; + } + + GATKDoclet doclet = new GATKDoclet(); + doclet.processDocs(rootDoc); + return true; + } + + /** + * Validate the given options against options supported by this doclet. + * @param option Option to validate. + * @return Number of potential parameters; 0 if not supported. 
+ */ + public static int optionLength(String option) { + if(option.equals("-build-timestamp") || option.equals("-absolute-version") || option.equals("-include-hidden")) { + return 2; + } + return 0; + } + + public boolean showHiddenFeatures() { + return showHiddenFeatures; + } + + public Set workUnits() { + TreeSet m = new TreeSet(); + + for ( ClassDoc doc : rootDoc.classes() ) { + logger.debug("Considering " + doc); + Class clazz = getClassForClassDoc(doc); + + if ( clazz != null && clazz.getName().equals("org.broadinstitute.sting.gatk.walkers.annotator.AlleleBalance")) + logger.debug("foo"); + + DocumentedGATKFeature feature = getFeatureForClassDoc(doc); + DocumentedGATKFeatureHandler handler = createHandler(doc, feature); + if ( handler != null && handler.shouldBeProcessed(doc) ) { + logger.info("Going to generate documentation for class " + doc); + String filename = handler.getDestinationFilename(doc); + GATKDocWorkUnit unit = new GATKDocWorkUnit(doc.name(), + filename, feature.groupName(), + feature, handler, doc, clazz, + buildTimestamp, absoluteVersion); + m.add(unit); + } + } + + return m; + } + + protected void processDocs(RootDoc rootDoc) { + // setup the global access to the root + this.rootDoc = rootDoc; + + try { + // basic setup + DESTINATION_DIR.mkdirs(); + FileUtils.copyFile(new File(SETTINGS_DIR + "/style.css"), new File(DESTINATION_DIR + "/style.css")); + + /* ------------------------------------------------------------------- */ + /* You should do this ONLY ONCE in the whole application life-cycle: */ + + Configuration cfg = new Configuration(); + // Specify the data source where the template files come from. + cfg.setDirectoryForTemplateLoading(SETTINGS_DIR); + // Specify how templates will see the data-model. This is an advanced topic... 
+ cfg.setObjectWrapper(new DefaultObjectWrapper()); + + Set myWorkUnits = workUnits(); + for ( GATKDocWorkUnit workUnit : myWorkUnits ) { + processDocWorkUnit(cfg, workUnit, myWorkUnits); + } + + processIndex(cfg, new ArrayList(myWorkUnits)); + } catch ( FileNotFoundException e ) { + throw new RuntimeException(e); + } catch ( IOException e ) { + throw new RuntimeException(e); + } + } + + private DocumentedGATKFeatureHandler createHandler(ClassDoc doc, DocumentedGATKFeature feature) { + try { + if ( feature != null ) { + if ( feature.enable() ) { + DocumentedGATKFeatureHandler handler = feature.handler().newInstance(); + handler.setDoclet(this); + return handler; + } else { + logger.info("Skipping disabled Documentation for " + doc); + } + } + } catch ( IllegalAccessException e) { + throw new RuntimeException(e); // the constructor is now private -- this is an error + } catch ( InstantiationException e) { + throw new RuntimeException(e); // the constructor is now private -- this is an error + } + + return null; + } + + private DocumentedGATKFeature getFeatureForClassDoc(ClassDoc doc) { + // todo -- what do I need the ? extends Object to pass the compiler? + Class docClass = getClassForClassDoc(doc); + if ( docClass != null && docClass.isAnnotationPresent(DocumentedGATKFeature.class) ) { + return docClass.getAnnotation(DocumentedGATKFeature.class); + } else { + return null; // not annotated so it shouldn't be documented + } + } + + private Class getClassForClassDoc(ClassDoc doc) { + try { + // todo -- what do I need the ? extends Object to pass the compiler? + return (Class)HelpUtils.getClassForDoc(doc); + } catch ( ClassNotFoundException e) { + //logger.warn("Couldn't find class for ClassDoc " + doc); + // we got a classdoc for a class we can't find. 
Maybe in a library or something + return null; + } catch ( NoClassDefFoundError e ) { + return null; + } catch ( UnsatisfiedLinkError e) { + return null; // naughty BWA bindings + } + } + + public static ClassDoc getClassDocForClass(RootDoc rootDoc, Class clazz) { + return rootDoc.classNamed(clazz.getName()); + } + + private void processIndex(Configuration cfg, List indexData) throws IOException { + /* Get or create a template */ + Template temp = cfg.getTemplate("generic.index.template.html"); + + /* Merge data-model with template */ + Writer out = new OutputStreamWriter(new FileOutputStream(new File(DESTINATION_DIR + "/index.html"))); + try { + temp.process(groupIndexData(indexData), out); + out.flush(); + } catch ( TemplateException e ) { + throw new ReviewedStingException("Failed to create GATK documentation", e); + } + } + + private Map groupIndexData(List indexData) { + // + // root -> data -> { summary -> y, filename -> z }, etc + // -> groups -> group1, group2, etc. + Map root = new HashMap(); + + Collections.sort(indexData); + + Set docFeatures = new HashSet(); + List> data = new ArrayList>(); + for ( GATKDocWorkUnit workUnit : indexData ) { + data.add(workUnit.toMap()); + docFeatures.add(workUnit.annotation); + } + + List> groups = new ArrayList>(); + for ( DocumentedGATKFeature feature : docFeatures ) { + groups.add(toMap(feature)); + } + + root.put("data", data); + root.put("groups", groups); + root.put("timestamp", buildTimestamp); + root.put("version", absoluteVersion); + + return root; + } + + private static final Map toMap(DocumentedGATKFeature annotation) { + Map root = new HashMap(); + root.put("name", annotation.groupName()); + root.put("summary", annotation.summary()); + return root; + } + + public final static GATKDocWorkUnit findWorkUnitForClass(Class c, Set all) { + for ( final GATKDocWorkUnit unit : all ) + if ( unit.clazz.equals(c) ) + return unit; + return null; + } + + private void processDocWorkUnit(Configuration cfg, GATKDocWorkUnit 
unit, Set all) + throws IOException { + //System.out.printf("Processing documentation for class %s%n", unit.classDoc); + + unit.handler.processOne(rootDoc, unit, all); + + // Get or create a template + Template temp = cfg.getTemplate(unit.handler.getTemplateName(unit.classDoc)); + + // Merge data-model with template + File outputPath = new File(DESTINATION_DIR + "/" + unit.filename); + try { + Writer out = new OutputStreamWriter(new FileOutputStream(outputPath)); + temp.process(unit.forTemplate, out); + out.flush(); + } catch ( TemplateException e ) { + throw new ReviewedStingException("Failed to create GATK documentation", e); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java new file mode 100644 index 000000000..c69345816 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.help; + +import com.google.java.contract.Requires; +import com.sun.javadoc.ClassDoc; +import com.sun.javadoc.FieldDoc; +import com.sun.javadoc.RootDoc; +import com.sun.javadoc.Tag; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.classloader.JVMUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.*; +import java.lang.reflect.Field; +import java.util.*; + +/** + * + */ +public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { + private static Logger logger = Logger.getLogger(GenericDocumentationHandler.class); + GATKDocWorkUnit toProcess; + ClassDoc classdoc; + Set all; + RootDoc rootDoc; + + @Override + public boolean shouldBeProcessed(ClassDoc doc) { + return true; +// try { +// Class type = HelpUtils.getClassForDoc(doc); +// return JVMUtils.isConcrete(type); +// } catch ( ClassNotFoundException e ) { +// return false; +// } + } + + + @Override + public String getTemplateName(ClassDoc doc) throws IOException { + return "generic.template.html"; + } + + @Override + public void processOne(RootDoc rootDoc, GATKDocWorkUnit toProcessArg, Set allArg) { + this.rootDoc = rootDoc; + this.toProcess = toProcessArg; + this.all = allArg; + this.classdoc = toProcess.classDoc; + + //System.out.printf("%s class %s%n", toProcess.group, toProcess.classDoc); + Map root = new HashMap(); + + addHighLevelBindings(root); + addArgumentBindings(root); + addRelatedBindings(root); + + 
toProcess.setHandlerContent((String)root.get("summary"), root); + } + + protected void addHighLevelBindings(Map root) { + root.put("name", classdoc.name()); + + // Extract overrides from the doc tags. + StringBuilder summaryBuilder = new StringBuilder(); + for(Tag tag: classdoc.firstSentenceTags()) + summaryBuilder.append(tag.text()); + root.put("summary", summaryBuilder.toString()); + root.put("description", classdoc.commentText().substring(summaryBuilder.toString().length())); + root.put("timestamp", toProcess.buildTimestamp); + root.put("version", toProcess.absoluteVersion); + + for(Tag tag: classdoc.tags()) { + root.put(tag.name(), tag.text()); + } + } + + protected void addArgumentBindings(Map root) { + ParsingEngine parsingEngine = createStandardGATKParsingEngine(); + + // attempt to instantiate the class + Object instance = makeInstanceIfPossible(toProcess.clazz); + + Map> args = new HashMap>(); + root.put("arguments", args); + args.put("all", new ArrayList()); + args.put("required", new ArrayList()); + args.put("optional", new ArrayList()); + args.put("hidden", new ArrayList()); + args.put("depreciated", new ArrayList()); + try { + for ( ArgumentSource argumentSource : parsingEngine.extractArgumentSources(HelpUtils.getClassForDoc(classdoc)) ) { + ArgumentDefinition argDef = argumentSource.createArgumentDefinitions().get(0); + FieldDoc fieldDoc = getFieldDoc(classdoc, argumentSource.field.getName()); + Map argBindings = docForArgument(fieldDoc, argumentSource, argDef); // todo -- why can you have multiple ones? + if ( ! 
argumentSource.isHidden() || getDoclet().showHiddenFeatures() ) { + logger.debug(String.format("Processing %s", argumentSource)); + String kind = "optional"; + if ( argumentSource.isRequired() ) kind = "required"; + else if ( argumentSource.isHidden() ) kind = "hidden"; + else if ( argumentSource.isDeprecated() ) kind = "depreciated"; + + // get the value of the field + if ( instance != null ) { + Object value = getFieldValue(toProcess.clazz, instance, fieldDoc.name()); + if ( value != null ) + argBindings.put("defaultValue", prettyPrintValueString(value)); + } + + args.get(kind).add(argBindings); + args.get("all").add(argBindings); + } else { + logger.debug(String.format("Skipping hidden feature %s", argumentSource)); + } + } + } catch ( ClassNotFoundException e ) { + throw new RuntimeException(e); + } + } + + private Object getFieldValue(Class c, Object instance, String fieldName) { + Field field = JVMUtils.findField(c, fieldName); + if ( field != null ) { + Object value = JVMUtils.getFieldValue(field, instance); + //System.out.printf("Fetched value of field %s in class %s: %s%n", fieldName, c, value); + return value; + } else { + return findFieldValueInArgumentCollections(c, instance, fieldName); + } + } + + private Object findFieldValueInArgumentCollections(Class c, Object instance, String fieldName) { + for ( Field field : JVMUtils.getAllFields(c) ) { + if ( field.isAnnotationPresent(ArgumentCollection.class) ) { + //System.out.printf("Searching for %s in argument collection field %s%n", fieldName, field); + Object fieldValue = JVMUtils.getFieldValue(field, instance); + Object value = getFieldValue(fieldValue.getClass(), fieldValue, fieldName); + if ( value != null ) + return value; + } + } + + return null; + } + + /** + * Assumes value != null + * @param value + * @return + */ + private Object prettyPrintValueString(Object value) { + if ( value.getClass().isArray() ) { + Class type = value.getClass().getComponentType(); + if ( 
boolean.class.isAssignableFrom(type) ) + return Arrays.toString((boolean[])value); + if ( byte.class.isAssignableFrom(type) ) + return Arrays.toString((byte[])value); + if ( char.class.isAssignableFrom(type) ) + return Arrays.toString((char[])value); + if ( double.class.isAssignableFrom(type) ) + return Arrays.toString((double[])value); + if ( float.class.isAssignableFrom(type) ) + return Arrays.toString((float[])value); + if ( int.class.isAssignableFrom(type) ) + return Arrays.toString((int[])value); + if ( long.class.isAssignableFrom(type) ) + return Arrays.toString((long[])value); + if ( short.class.isAssignableFrom(type) ) + return Arrays.toString((short[])value); + if ( Object.class.isAssignableFrom(type) ) + return Arrays.toString((Object[])value); + else + throw new RuntimeException("Unexpected array type in prettyPrintValue. Value was " + value + " type is " + type); + } else + return value.toString(); + } + + private Object makeInstanceIfPossible(Class c) { + Object instance = null; + try { + // don't try to make something where we will obviously fail + if (! c.isEnum() && ! c.isAnnotation() && ! c.isAnonymousClass() && + ! c.isArray() && ! 
c.isPrimitive() & JVMUtils.isConcrete(c) ) { + instance = c.newInstance(); + //System.out.printf("Created object of class %s => %s%n", c, instance); + return instance; + } else + return null; + } + catch (IllegalAccessException e ) { } + catch (InstantiationException e ) { } + catch (ExceptionInInitializerError e ) { } + catch (SecurityException e ) { } + // this last one is super dangerous, but some of these methods catch ClassNotFoundExceptions + // and rethrow then as RuntimeExceptions + catch (RuntimeException e) {} + finally { + if ( instance == null ) + logger.warn(String.format("Unable to create instance of class %s => %s", c, instance)); + } + + return instance; + } + + protected void addRelatedBindings(Map root) { + List> extraDocsData = new ArrayList>(); + + // add in all of the explicitly related items + for ( final Class extraDocClass : toProcess.annotation.extraDocs() ) { + final GATKDocWorkUnit otherUnit = GATKDoclet.findWorkUnitForClass(extraDocClass, all); + if ( otherUnit == null ) + throw new ReviewedStingException("Requested extraDocs for class without any documentation: " + extraDocClass); + extraDocsData.add( + new HashMap(){{ + put("filename", otherUnit.filename); + put("name", otherUnit.name);}}); + + } + + List> hierarchyDocs = new ArrayList>(); + for (final GATKDocWorkUnit other : all ) { + final String relation = classRelationship(toProcess.clazz, other.clazz); + if ( relation != null ) + hierarchyDocs.add( + new HashMap(){{ + put("filename", other.filename); + put("relation", relation); + put("name", other.name);}}); + + } + + root.put("relatedDocs", hierarchyDocs); + root.put("extradocs", extraDocsData); + } + + private static final String classRelationship(Class me, Class other) { + if ( other.equals(me) ) + // no circular references + return null; + else if ( other.isAssignableFrom(me) ) + // toProcess is a superclass of other.clazz + return "superclass"; + else if ( me.isAssignableFrom(other) ) + // toProcess inherits from other.clazz 
+ return "subclass"; + else + return null; + + } + + protected ParsingEngine createStandardGATKParsingEngine() { + CommandLineProgram clp = new CommandLineGATK(); + try { + CommandLineProgram.start(clp, new String[]{}, true); + return clp.parser; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private FieldDoc getFieldDoc(ClassDoc classDoc, String name) { + return getFieldDoc(classDoc, name, true); + } + + private FieldDoc getFieldDoc(ClassDoc classDoc, String name, boolean primary) { + //System.out.printf("Looking for %s in %s%n", name, classDoc.name()); + for ( FieldDoc fieldDoc : classDoc.fields(false) ) { + //System.out.printf("fieldDoc " + fieldDoc + " name " + fieldDoc.name()); + if ( fieldDoc.name().equals(name) ) + return fieldDoc; + + Field field = HelpUtils.getFieldForFieldDoc(fieldDoc); + if ( field.isAnnotationPresent(ArgumentCollection.class) ) { + ClassDoc typeDoc = getRootDoc().classNamed(fieldDoc.type().qualifiedTypeName()); + if ( typeDoc == null ) + throw new ReviewedStingException("Tried to get javadocs for ArgumentCollection field " + fieldDoc + " but could't find the class in the RootDoc"); + else { + FieldDoc result = getFieldDoc(typeDoc, name, false); + if ( result != null ) + return result; + // else keep searching + } + } + } + + // if we didn't find it here, wander up to the superclass to find the field + if ( classDoc.superclass() != null ) { + return getFieldDoc(classDoc.superclass(), name, false); + } + + if ( primary ) + throw new RuntimeException("No field found for expected field " + name); + else + return null; + } + + protected Map docForArgument(FieldDoc fieldDoc, ArgumentSource source, ArgumentDefinition def) { + Map root = new HashMap(); + root.put("name", def.shortName != null ? "-" + def.shortName : "--" + def.fullName ); + + if ( def.shortName != null && def.fullName != null ) + root.put("synonyms", "--" + def.fullName); + + root.put("required", def.required ? 
"yes" : "no"); + root.put("type", def.argumentType.getSimpleName()); + + // summary and fulltext + root.put("summary", def.doc != null ? def.doc : ""); + root.put("fulltext", fieldDoc.commentText()); + + List attributes = new ArrayList(); + // this one below is just too much. + //attributes.add(def.ioType.annotationClass.getSimpleName()); + if ( def.required ) attributes.add("required"); + // flag is just boolean, not interesting + //if ( def.isFlag ) attributes.add("flag"); + if ( def.isHidden ) attributes.add("hidden"); + if ( source.isDeprecated() ) attributes.add("depreciated"); + if ( attributes.size() > 0 ) + root.put("attributes", Utils.join(", ", attributes)); + + if ( def.validOptions != null ) { + root.put("options", docForEnumArgument(source.field.getType())); + } + + return root; + } + + @Requires("enumClass.isEnum()") + private List> docForEnumArgument(Class enumClass) { + ClassDoc doc = GATKDoclet.getClassDocForClass(rootDoc, enumClass); + if ( doc == null ) // || ! doc.isEnum() ) + throw new RuntimeException("Tried to get docs for enum " + enumClass + " but got instead: " + doc); + + List> bindings = new ArrayList>(); + for (final FieldDoc field : doc.fields(false) ) { + bindings.add( + new HashMap(){{ + put("name", field.name()); + put("summary", field.commentText());}}); + } + + return bindings; + } + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpTaglet.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpTaglet.java deleted file mode 100644 index b350b1a29..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpTaglet.java +++ /dev/null @@ -1,91 +0,0 @@ -package org.broadinstitute.sting.utils.help; - -import com.sun.javadoc.Tag; -import com.sun.tools.doclets.Taglet; - -/** - * Basic functionality for the help taglet. - * - * @author mhanna - * @version 0.1 - */ -public abstract class HelpTaglet implements Taglet { - /** - * Return the name of this custom tag. 
- */ - public abstract String getName(); - - /** - * Will return false since this tag cannot be applied - * to a field. - * @return false since this tag cannot be applied to a field. - */ - public boolean inField() { - return false; - } - - /** - * Will return false since by default, help tags cannot be applied to a constructor. - * @return false since by default, help tags cannot be applied to a constructor. - */ - public boolean inConstructor() { - return false; - } - - /** - * Will return false since by default, help tags cannot be applied to a method. - * @return false since by default, this tag cannot be applied to a method. - */ - public boolean inMethod() { - return false; - } - - /** - * Will return false since by default, help tags cannot be applied to an overview. - * @return false since by default, help tags cannot be applied to an overview. - */ - public boolean inOverview() { - return false; - } - - /** - * Will return false since by default, help tags cannot be applied to a package. - * description. - * @return false since by default, help tags cannot be applied to a package. - */ - public boolean inPackage() { - return false; - } - - /** - * Will return false since help tags are by default not inline. - * @return false since help tags are by default not inline. - */ - public boolean inType() { - return false; - } - - /** - * Will return false since help tags are by default not inline. - * @return false since help tags are by default not inline. - */ - public boolean isInlineTag() { - return false; - } - - /** - * Create a string representation of this tag. Since this tag is only - * used by the help system, don't output any HTML. - */ - public String toString(Tag tag) { - return null; - } - - /** - * Create a string representation of this tag. Since this tag is only - * used by the help system, don't output any HTML. 
- */ - public String toString(Tag[] tags) { - return null; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java new file mode 100644 index 000000000..4527c6afe --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.help; + +import com.sun.javadoc.FieldDoc; +import com.sun.javadoc.PackageDoc; +import com.sun.javadoc.ProgramElementDoc; +import org.broadinstitute.sting.utils.classloader.JVMUtils; + +import java.lang.reflect.Field; + +public class HelpUtils { + protected static boolean implementsInterface(ProgramElementDoc classDoc, Class... 
interfaceClasses) { + for (Class interfaceClass : interfaceClasses) + if (assignableToClass(classDoc, interfaceClass, false)) + return true; + return false; + } + + protected static boolean assignableToClass(ProgramElementDoc classDoc, Class lhsClass, boolean requireConcrete) { + try { + Class type = getClassForDoc(classDoc); + return lhsClass.isAssignableFrom(type) && (!requireConcrete || JVMUtils.isConcrete(type)); + } catch (Throwable t) { + // Ignore errors. + return false; + } + } + + protected static Class getClassForDoc(ProgramElementDoc doc) throws ClassNotFoundException { + return Class.forName(getClassName(doc)); + } + + protected static Field getFieldForFieldDoc(FieldDoc fieldDoc) { + try { + Class clazz = getClassForDoc(fieldDoc.containingClass()); + return JVMUtils.findField(clazz, fieldDoc.name()); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + } + + /** + * Reconstitute the class name from the given class JavaDoc object. + * + * @param doc the Javadoc model for the given class. + * @return The (string) class name of the given class. + */ + protected static String getClassName(ProgramElementDoc doc) { + PackageDoc containingPackage = doc.containingPackage(); + return containingPackage.name().length() > 0 ? 
+ String.format("%s.%s", containingPackage.name(), doc.name()) : + String.format("%s", doc.name()); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java b/public/java/src/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java index 6ee12d42e..a28a7bcee 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/ResourceBundleExtractorDoclet.java @@ -28,14 +28,9 @@ package org.broadinstitute.sting.utils.help; import com.sun.javadoc.*; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.classloader.JVMUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.*; -import java.util.HashSet; -import java.util.Properties; -import java.util.Scanner; -import java.util.Set; +import java.util.*; /** * Extracts certain types of javadoc (specifically package and class descriptions) and makes them available @@ -48,17 +43,21 @@ public class ResourceBundleExtractorDoclet { /** * Taglet for the particular version number. */ - private static final String VERSION_TAGLET_NAME = "version"; + public static final String VERSION_TAGLET_NAME = "version"; + public static final String SUMMARY_TAGLET_NAME = "help.summary"; + public static final String DESCRIPTION_TAGLET_NAME = "help.description"; /** * Maintains a collection of resources in memory as they're accumulated. */ - private static final Properties resourceText = new Properties(); + protected final Properties resourceText = new Properties(); /** * Maintains a collection of classes that should really be documented. 
*/ - private static final Set undocumentedWalkers = new HashSet(); + protected final Set undocumentedWalkers = new HashSet(); + + protected String buildTimestamp = null, absoluteVersion = null; /** * Extracts the contents of certain types of javadoc and adds them to an XML file. @@ -67,26 +66,38 @@ public class ResourceBundleExtractorDoclet { * @throws IOException if output can't be written. */ public static boolean start(RootDoc rootDoc) throws IOException { + ResourceBundleExtractorDoclet doclet = new ResourceBundleExtractorDoclet(); + PrintStream out = doclet.loadData(rootDoc, true); + doclet.processDocs(rootDoc, out); + return true; + } + + protected PrintStream loadData(RootDoc rootDoc, boolean overwriteResourcesFile) { PrintStream out = System.out; - String buildTimestamp = null, versionPrefix = null, versionSuffix = null, absoluteVersion = null; for(String[] options: rootDoc.options()) { if(options[0].equals("-out")) { - loadExistingResourceFile(options[1], rootDoc); - out = new PrintStream(options[1]); + try { + loadExistingResourceFile(options[1], rootDoc); + if ( overwriteResourcesFile ) + out = new PrintStream(options[1]); + } catch ( FileNotFoundException e ) { + throw new RuntimeException(e); + } catch ( IOException e ) { + throw new RuntimeException(e); + } } if(options[0].equals("-build-timestamp")) buildTimestamp = options[1]; - if(options[0].equals("-version-prefix")) - versionPrefix = options[1]; - if(options[0].equals("-version-suffix")) - versionSuffix = options[1]; if (options[0].equals("-absolute-version")) absoluteVersion = options[1]; } resourceText.setProperty("build.timestamp",buildTimestamp); + return out; + } + protected void processDocs(RootDoc rootDoc, PrintStream out) { // Cache packages as we see them, since there's no direct way to iterate over packages. 
Set packages = new HashSet(); @@ -97,13 +108,19 @@ public class ResourceBundleExtractorDoclet { if(isRequiredJavadocMissing(currentClass) && isWalker(currentClass)) undocumentedWalkers.add(currentClass.name()); - renderHelpText(getClassName(currentClass),currentClass,versionPrefix,versionSuffix,absoluteVersion); + renderHelpText(HelpUtils.getClassName(currentClass),currentClass); } for(PackageDoc currentPackage: packages) - renderHelpText(currentPackage.name(),currentPackage,versionPrefix,versionSuffix,absoluteVersion); + renderHelpText(currentPackage.name(),currentPackage); - resourceText.store(out,"Strings displayed by the Sting help system"); + try { + resourceText.store(out,"Strings displayed by the Sting help system"); + } catch ( FileNotFoundException e ) { + throw new RuntimeException(e); + } catch ( IOException e ) { + throw new RuntimeException(e); + } // ASCII codes for making text blink final String blink = "\u001B\u005B\u0035\u006D"; @@ -111,8 +128,6 @@ public class ResourceBundleExtractorDoclet { if(undocumentedWalkers.size() > 0) Utils.warnUser(String.format("The following walkers are currently undocumented: %s%s%s", blink, Utils.join(" ",undocumentedWalkers), reset)); - - return true; } /** @@ -121,7 +136,7 @@ public class ResourceBundleExtractorDoclet { * @return Number of potential parameters; 0 if not supported. */ public static int optionLength(String option) { - if(option.equals("-build-timestamp") || option.equals("-version-prefix") || option.equals("-version-suffix") || option.equals("-out") || option.equals("-absolute-version") ) { + if(option.equals("-build-timestamp") || option.equals("-out") || option.equals("-absolute-version") ) { return 2; } return 0; @@ -137,7 +152,7 @@ public class ResourceBundleExtractorDoclet { * @throws IOException if there is an I/O-related error other than FileNotFoundException * while attempting to read the resource file. 
*/ - private static void loadExistingResourceFile( String resourceFileName, RootDoc rootDoc ) throws IOException { + private void loadExistingResourceFile( String resourceFileName, RootDoc rootDoc ) throws IOException { try { BufferedReader resourceFile = new BufferedReader(new FileReader(resourceFileName)); try { @@ -157,27 +172,8 @@ public class ResourceBundleExtractorDoclet { * @param classDoc the type of the given class. * @return True if the class of the given name is a walker. False otherwise. */ - private static boolean isWalker(ClassDoc classDoc) { - try { - Class type = Class.forName(getClassName(classDoc)); - return Walker.class.isAssignableFrom(type) && JVMUtils.isConcrete(type); - } - catch(Throwable t) { - // Ignore errors. - return false; - } - } - - /** - * Reconstitute the class name from the given class JavaDoc object. - * @param classDoc the Javadoc model for the given class. - * @return The (string) class name of the given class. - */ - private static String getClassName(ClassDoc classDoc) { - PackageDoc containingPackage = classDoc.containingPackage(); - return containingPackage.name().length() > 0 ? - String.format("%s.%s",containingPackage.name(),classDoc.name()) : - String.format("%s",classDoc.name()); + protected static boolean isWalker(ClassDoc classDoc) { + return HelpUtils.assignableToClass(classDoc, Walker.class, true); } /** @@ -186,8 +182,6 @@ public class ResourceBundleExtractorDoclet { * @return True if the JavaDoc is missing. False otherwise. */ private static boolean isRequiredJavadocMissing(ClassDoc classDoc) { - if(classDoc.containingPackage().name().contains("oneoffprojects")) - return false; return classDoc.commentText().length() == 0 || classDoc.commentText().contains("Created by IntelliJ"); } @@ -195,53 +189,23 @@ public class ResourceBundleExtractorDoclet { * Renders all the help text required for a given name. * @param elementName element name to use as the key * @param element Doc element to process. 
- * @param versionPrefix Text to add to the start of the version string. - * @param versionSuffix Text to add to the end of the version string. */ - private static void renderHelpText(String elementName, Doc element, String versionPrefix, String versionSuffix, String absoluteVersion) { - // Extract overrides from the doc tags. - String name = null; - String version = null; + private void renderHelpText(String elementName, Doc element) { StringBuilder summaryBuilder = new StringBuilder(); for(Tag tag: element.firstSentenceTags()) summaryBuilder.append(tag.text()); String summary = summaryBuilder.toString(); String description = element.commentText(); - for(Tag tag: element.tags()) { - if(tag.name().equals("@"+DisplayNameTaglet.NAME)) { - if(name != null) - throw new ReviewedStingException("Only one display name tag can be used per package / walker."); - name = tag.text(); - } - else if(tag.name().equals("@"+VERSION_TAGLET_NAME)) { - if ( absoluteVersion != null ) { - version = absoluteVersion; - } - else { - version = String.format("%s%s%s", (versionPrefix != null) ? versionPrefix : "", - tag.text(), - (versionSuffix != null) ? versionSuffix : ""); - } - } - else if(tag.name().equals("@"+SummaryTaglet.NAME)) - summary = tag.text(); - else if(tag.name().equals("@"+DescriptionTaglet.NAME)) - description = tag.text(); - } - - // Write out an alternate element name, if exists. - if(name != null) - resourceText.setProperty(String.format("%s.%s",elementName,DisplayNameTaglet.NAME),name); - - if(version != null) - resourceText.setProperty(String.format("%s.%s",elementName,VERSION_TAGLET_NAME),version); + // this might seem unnecessary, but the GATK command line program uses this tag to determine the version when running + if(absoluteVersion != null) + resourceText.setProperty(String.format("%s.%s",elementName,VERSION_TAGLET_NAME),absoluteVersion); // Write out an alternate element summary, if exists. 
- resourceText.setProperty(String.format("%s.%s",elementName,SummaryTaglet.NAME),formatText(summary)); + resourceText.setProperty(String.format("%s.%s",elementName,SUMMARY_TAGLET_NAME),formatText(summary)); // Write out an alternate description, if present. - resourceText.setProperty(String.format("%s.%s",elementName,DescriptionTaglet.NAME),formatText(description)); + resourceText.setProperty(String.format("%s.%s",elementName,DESCRIPTION_TAGLET_NAME),formatText(description)); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/help/SummaryTaglet.java b/public/java/src/org/broadinstitute/sting/utils/help/SummaryTaglet.java deleted file mode 100644 index db8b55940..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/help/SummaryTaglet.java +++ /dev/null @@ -1,58 +0,0 @@ -package org.broadinstitute.sting.utils.help; - -import com.sun.tools.doclets.Taglet; - -import java.util.Map; - -/** - * Provide an alternate brief summary for this walker / package. - * Acts as an alternative to the first sentence employed by default. - * @author mhanna - * @version 0.1 - */ -public class SummaryTaglet extends HelpTaglet { - /** - * The key tag for this taglet. - */ - public static final String NAME = "help.summary"; - - /** - * Return the name of this custom tag. - */ - @Override - public String getName() { - return NAME; - } - - /** - * Will return false since overviews are always named - * by the @WalkerName tag. - * @return false always - */ - @Override - public boolean inOverview() { - return true; - } - - /** - * Will return true to indicate that packages can be given useful summary. - * @return true always - */ - @Override - public boolean inPackage() { - return true; - } - - /** - * Register this Taglet. - * @param tagletMap the map to register this tag to. 
- */ - public static void register(Map tagletMap) { - SummaryTaglet tag = new SummaryTaglet(); - Taglet t = (Taglet)tagletMap.get(tag.getName()); - if (t != null) { - tagletMap.remove(tag.getName()); - } - tagletMap.put(tag.getName(), tag); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java b/public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java index 1d4251542..3159f3fb7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/text/TextFormattingUtils.java @@ -116,4 +116,57 @@ public class TextFormattingUtils { return bundle; } + + /** + * Returns the word starting positions within line, excluding the first position 0. + * The returned list is compatible with splitFixedWidth. + * @param line Text to parse. + * @return the word starting positions within line, excluding the first position 0. + */ + public static List getWordStarts(String line) { + if (line == null) + throw new ReviewedStingException("line is null"); + List starts = new ArrayList(); + int stop = line.length(); + for (int i = 1; i < stop; i++) + if (Character.isWhitespace(line.charAt(i-1))) + if(!Character.isWhitespace(line.charAt(i))) + starts.add(i); + return starts; + } + + /** + * Parses a fixed width line of text. + * @param line Text to parse. + * @param columnStarts the column starting positions within line, excluding the first position 0. + * @return The parsed string array with each entry trimmed. 
+ */ + public static String[] splitFixedWidth(String line, List columnStarts) { + if (line == null) + throw new ReviewedStingException("line is null"); + if (columnStarts == null) + throw new ReviewedStingException("columnStarts is null"); + int startCount = columnStarts.size(); + String[] row = new String[startCount + 1]; + if (startCount == 0) { + row[0] = line.trim(); + } else { + row[0] = line.substring(0, columnStarts.get(0)).trim(); + for (int i = 1; i < startCount; i++) + row[i] = line.substring(columnStarts.get(i - 1), columnStarts.get(i)).trim(); + row[startCount] = line.substring(columnStarts.get(startCount - 1)).trim(); + } + return row; + } + + /** + * Parses a line of text by whitespace. + * @param line Text to parse. + * @return The parsed string array. + */ + public static String[] splitWhiteSpace(String line) { + if (line == null) + throw new ReviewedStingException("line is null"); + return line.trim().split("\\s+"); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ClosableReentrantLock.java b/public/java/src/org/broadinstitute/sting/utils/threading/ClosableReentrantLock.java deleted file mode 100644 index d16c19130..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/ClosableReentrantLock.java +++ /dev/null @@ -1,16 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import java.util.concurrent.locks.ReentrantLock; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 1/19/11 - * Time: 9:50 AM - * - * Simple extension of a ReentrantLock that supports a close method. 
- */ -public class ClosableReentrantLock extends ReentrantLock { - public boolean ownsLock() { return super.isHeldByCurrentThread(); } - public void close() {} -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/FileBackedGenomeLocProcessingTracker.java b/public/java/src/org/broadinstitute/sting/utils/threading/FileBackedGenomeLocProcessingTracker.java deleted file mode 100644 index 3763ec67d..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/FileBackedGenomeLocProcessingTracker.java +++ /dev/null @@ -1,114 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.*; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -/** - * Keeps a copy of the processing locks in a file - */ -public class FileBackedGenomeLocProcessingTracker extends GenomeLocProcessingTracker { - private static final Logger logger = Logger.getLogger(FileBackedGenomeLocProcessingTracker.class); - private static final boolean DEBUG = false; - private static final String READ_MODE = "r"; - private static final String WRITE_MODE = "rws"; - - private final File sharedFile; - private final GenomeLocParser parser; - private long lastReadPosition = 0; - - public FileBackedGenomeLocProcessingTracker(File sharedFile, GenomeLocParser parser, ClosableReentrantLock lock, PrintStream status) { - super(lock, status); - - this.sharedFile = sharedFile; - this.parser = parser; - } - - private RandomAccessFile openFile(String mode) { - try { - return new RandomAccessFile(sharedFile, mode); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(sharedFile, e); - } - } - - private void closeFile(RandomAccessFile raFile) { - try { - if ( raFile != null ) 
raFile.close(); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(sharedFile, e); - } - } - - @Override - protected List readNewLocs() { - List newPLocs = new ArrayList(); // todo -- gratitous object creation - - if ( sharedFile.exists() ) { - RandomAccessFile raFile = null; - try { - raFile = openFile(READ_MODE); - //logger.warn(String.format("Reading new locs at: file.length=%d last=%d", raFile.length(), lastReadPosition)); - if ( raFile.length() > lastReadPosition ) { - raFile.seek(lastReadPosition); - - int counter = 0; - String line = raFile.readLine(); // Read another line - while ( line != null ) { - String[] parts = line.split(" "); - if ( parts.length != 2 ) throw new ReviewedStingException("BUG: bad sharedFile line '" + line + "' at " + raFile.getFilePointer()); - ProcessingLoc ploc = new ProcessingLoc(parser.parseGenomeLoc(parts[0]), parts[1]); - //logger.warn(" Read " + ploc); - newPLocs.add(ploc); - line = raFile.readLine(); - counter++; - } - lastReadPosition = raFile.getFilePointer(); - if ( DEBUG ) logger.warn(String.format("Read %s locs from file, current pos is %d, # read new locs is %d", - counter, lastReadPosition, newPLocs.size())); - } - } catch (FileNotFoundException e) { - throw new UserException.CouldNotReadInputFile(sharedFile, e); - } catch (IOException e) { - throw new ReviewedStingException("Couldn't read sharedFile " + sharedFile, e); - } finally { - closeFile(raFile); - } - } - - return newPLocs; - } - - @Override - protected void registerNewLocs(Collection plocs) { - RandomAccessFile raFile = null; - - try { - raFile = openFile(WRITE_MODE); - long startPos = raFile.getFilePointer(); - raFile.seek(raFile.length()); - StringBuffer bytes = new StringBuffer(); - for ( ProcessingLoc ploc : plocs ) { - String packet = String.format("%s %s%n", ploc.getLocation(), ploc.getOwner()); - bytes.append(packet); - if ( DEBUG ) logger.warn(String.format("Wrote loc %s to file: %d + %d bytes ending at %d", ploc, startPos, 
packet.length(), raFile.getFilePointer())); - } - raFile.write(bytes.toString().getBytes()); - //raFile.getChannel().force(true); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(sharedFile, e); - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(sharedFile, e); - } finally { - closeFile(raFile); - } - } -} - - diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTracker.java b/public/java/src/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTracker.java deleted file mode 100644 index e97a73fb8..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTracker.java +++ /dev/null @@ -1,486 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.HasGenomeLocation; -import org.broadinstitute.sting.utils.SimpleTimer; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; -import java.text.SimpleDateFormat; -import java.util.*; - -/** - * Abstract base class to coordinating data processing by a collecting for processes / threads. - * - * Conceptually, the genome is viewed as a collection of non-overlapping genome location: - * - * chr1:1-10 - * chr1:11-20 - * chr1:21-30 - * etc. - * - * This class, and it's concrete derived classes, provide the ability to claim individual locations - * as "mine", and exclude other processes / threads from processing them. 
At the lowest-level this - * is implemented by the claimOwnership(loc, name) function, that returns true if loc free (unclaimed) - * and makes name the owner of loc. High-level, and more efficient operations provide claiming - * iterators over streams of objects implementing the HasGenomeLocation interface, so that you can - * write code that looks like: - * - * for ( GenomeLoc ownedLoc : onlyOwned(allLocsToProcess.iterator) ) { - * doSomeWork(ownedLoc) - * - * Much of the code in this class is actually surrounding debugging and performance metrics code. - * The actual synchronization code is separated out into the ClosableReentrantLock() system - * and the two abstract functions: - * - * protected abstract void registerNewLocs(Collection plocs); - * protected abstract Collection readNewLocs(); - * - * That maintain the state of the tracker. - * - * That is, the ProcessingTracker is made of two components: a thread / process locking system and - * a subclass that implements the methods to record new claimed state changes and to read out updates - * that may have occurred by another thread or process. - * - * NOTE: this class assumes that all threads / processes are working with the same set of potential - * GenomeLocs to own. Claiming chr1:1-10 and then chr1:5-6 is allowed by the system. Basically, - * you only can stake claim to GenomeLocs that are .equal(). 
- */ -public abstract class GenomeLocProcessingTracker { - private final static Logger logger = Logger.getLogger(FileBackedGenomeLocProcessingTracker.class); - private final static SimpleDateFormat STATUS_FORMAT = new SimpleDateFormat("HH:mm:ss,SSS"); - private final static int DEFAULT_OWNERSHIP_ITERATOR_SIZE = 1; - - /** - * Useful state strings for printing status - */ - private final static String GOING_FOR_LOCK = "going_for_lock"; - private final static String RELEASING_LOCK = "releasing_lock"; - private final static String HAVE_LOCK = "have_lock"; - private final static String RUNNING = "running"; - - /** - * A map, for efficiency, that allows quick lookup of the processing loc for a - * given GenomeLoc. The map points from loc -> loc / owner as a ProcessingLoc - */ - private final Map processingLocs; - - /** - * The locking object used to protect data from simulatanous access by multiple - * threads or processes. - */ - private final ClosableReentrantLock lock; - - /** A stream for writing status messages. Can be null if we aren't writing status */ - private final PrintStream status; - - // - // Timers for recording performance information - // Note -- these cannot be used because this class isn't thread safe, and neither are the - // timers, so they result in invalid operations w.r.t. 
the SimpleTimer contract - // -// protected final SimpleTimer writeTimer = new SimpleTimer("writeTimer"); -// protected final SimpleTimer readTimer = new SimpleTimer("readTimer"); -// protected final SimpleTimer lockWaitTimer = new SimpleTimer("lockWaitTimer"); - protected final SimpleTimer timer = new SimpleTimer(); - protected long nLocks = 0, nWrites = 0, nReads = 0; - - // -------------------------------------------------------------------------------- - // - // Creating ProcessingTrackers - // - // -------------------------------------------------------------------------------- - public GenomeLocProcessingTracker(ClosableReentrantLock lock, PrintStream status) { - this.processingLocs = new HashMap(); - this.status = status; - this.lock = lock; - printStatusHeader(); - } - - // -------------------------------------------------------------------------------- - // - // Code to override to change the dynamics of the the GenomeLocProcessingTracker - // - // -------------------------------------------------------------------------------- - - protected void close() { - lock.close(); - if ( status != null ) status.close(); - } - - /** - * Takes a collection of newly claimed (i.e., previous unclaimed) genome locs - * and the name of their owner and "registers" this data in some persistent way that's - * visible to all threads / processes communicating via this GenomeLocProcessingTracker. - * - * Could be a in-memory data structure (a list) if we are restricting ourselves to intra-memory - * parallelism, a locked file on a shared file system, or a server we communicate with. - * - * @param plocs - */ - protected abstract void registerNewLocs(Collection plocs); - - /** - * The inverse of the registerNewLocs() function. Looks at the persistent data store - * shared by all threads / processes and returns the ones that have appeared since the last - * call to readNewLocs(). 
Note that we expect the pair of registerNewLocs and readNewLocs to - * include everything, even locs registered by this thread / process. For example: - * - * readNewLocs() => List() - * registerNewLocs(List(x, y,)) => void - * readNewLocs() => List(x,y)) - * - * even for this thread or process. - * @return - */ - protected abstract Collection readNewLocs(); - - - // -------------------------------------------------------------------------------- - // - // Code to claim intervals for processing and query for their ownership - // - // -------------------------------------------------------------------------------- - - /** - * Queries the current database if a location is owned. Does not guarantee that the - * loc can be owned in a future call, though. - * - * @param loc - * @return - */ - public final boolean locIsOwned(GenomeLoc loc, String id) { - return findOwner(loc, id) != null; - } - - /** - * The workhorse routine. Attempt to claim processing ownership of loc, with my name. - * This is an atomic operation -- other threads / processes will wait until this function - * returns. The return result is the ProcessingLoc object describing who owns this - * location. If the location isn't already claimed and we now own the location, the pl owner - * will be myName. Otherwise, the name of the owner can found in the pl. 
- * - * @param loc - * @param myName - * @return - */ - public final ProcessingLoc claimOwnership(final GenomeLoc loc, final String myName) { - // processingLocs is a shared memory synchronized object, and this - // method is synchronized, so we can just do our processing - return new WithLock(myName) { - public ProcessingLoc doBody() { - ProcessingLoc owner = findOwner(loc, myName); - if ( owner == null ) { // we are unowned - owner = new ProcessingLoc(loc, myName); - registerNewLocsWithTimers(Arrays.asList(owner), myName); - } - return owner; - } - }.run(); - } - - - // -------------------------------------------------------------------------------- - // - // High-level iterator-style interface to claiming ownership - // - // -------------------------------------------------------------------------------- - - /** - * A higher-level, and more efficient, interface to obtain the next location we own. Takes an - * iterator producing objects that support the getLocation() interface, and returns the next - * object in that stream that we can claim ownership of. Returns null if we run out of elements - * during the iteration. 
- * - * Can be more efficiently implemented in subclasses to avoid multiple unlocking - * - * @param iterator - * @param myName - * @return - */ - public final T claimOwnershipOfNextAvailable(Iterator iterator, String myName) { - OwnershipIterator myIt = new OwnershipIterator(iterator, myName, 1); - return myIt.next(); - } - - public final Iterable onlyOwned(Iterator iterator, String myName) { - return new OwnershipIterator(iterator, myName); - } - - private final class OwnershipIterator implements Iterator, Iterable { - private final Iterator subit; - private final String myName; - private final Queue cache; - private final int cacheSize; - - public OwnershipIterator(Iterator subit, String myName) { - this(subit, myName, DEFAULT_OWNERSHIP_ITERATOR_SIZE); - } - - public OwnershipIterator(Iterator subit, String myName, int cacheSize) { - this.subit = subit; - this.myName = myName; - cache = new LinkedList(); - this.cacheSize = cacheSize; - } - - /** - * Will return true for all elements of subit, even if we can't get ownership of some of the future - * elements and so will return null there - * @return - */ - public final boolean hasNext() { - return cache.peek() != null || subit.hasNext(); - } - - /** - * High performance iterator that only locks and unlocks once per claimed object found. 
Avoids - * locking / unlocking for each query - * - * @return an object of type T owned by this thread, or null if none of the remaining object could be claimed - */ - public final T next() { - if ( cache.peek() != null) - return cache.poll(); - else { - // cache is empty, we need to fill up the cache and return the first element of the queue - return new WithLock(myName) { - public T doBody() { - // read once the database of owners at the start - updateAndGetProcessingLocs(myName); - - boolean done = false; - Queue pwns = new LinkedList(); // ;-) - while ( !done && cache.size() < cacheSize && subit.hasNext() ) { - final T elt = subit.next(); - GenomeLoc loc = elt.getLocation(); - - ProcessingLoc owner = processingLocs.get(loc); - - if ( owner == null ) { // we are unowned - owner = new ProcessingLoc(loc, myName); - pwns.offer(owner); - if ( ! cache.offer(elt) ) throw new ReviewedStingException("Cache offer unexpectedly failed"); - if ( GenomeLoc.isUnmapped(loc) ) done = true; - } - // if not, we continue our search - } - - registerNewLocsWithTimers(pwns, myName); - - // we've either filled up the cache or run out of elements. Either way we return - // the first element of the cache. If the cache is empty, we return null here. - return cache.poll(); - } - }.run(); - } - } - - public final void remove() { - throw new UnsupportedOperationException(); - } - - public final Iterator iterator() { - return this; - } - } - - // -------------------------------------------------------------------------------- - // - // private / protected low-level accessors / manipulators and utility functions - // - // -------------------------------------------------------------------------------- - - /** - * Useful debugging function that returns the ProcessingLoc who owns loc. 
ID - * is provided for debugging purposes - * @param loc - * @param id - * @return - */ - protected final ProcessingLoc findOwner(GenomeLoc loc, String id) { - // fast path to check if we already have the existing genome loc in memory for ownership claims - // getProcessingLocs() may be expensive [reading from disk, for example] so we shouldn't call it - // unless necessary - ProcessingLoc x = processingLocs.get(loc); - return x == null ? updateAndGetProcessingLocs(id).get(loc) : x; - } - - /** - * Returns the list of currently owned locations, updating the database as necessary. - * DO NOT MODIFY THIS MAP! As with all parallelizing data structures, the list may be - * out of date immediately after the call returns, or may be updating on the fly. - * @return - */ - protected final Map updateAndGetProcessingLocs(String myName) { - return new WithLock>(myName) { - public Map doBody() { -// readTimer.restart(); - for ( ProcessingLoc p : readNewLocs() ) - processingLocs.put(p.getLocation(), p); -// readTimer.stop(); - nReads++; - return processingLocs; - } - }.run(); - } - - /** - * Wrapper around registerNewLocs that also times the operation - * - * @param plocs - * @param myName - */ - protected final void registerNewLocsWithTimers(Collection plocs, String myName) { -// writeTimer.restart(); - registerNewLocs(plocs); - nWrites++; -// writeTimer.stop(); - } - - private final void printStatusHeader() { - if ( status != null ) status.printf("process.id\thr.time\ttime\tstate%n"); - } - - private final void printStatus(String id, long machineTime, String state) { - // prints a line like processID human-readable-time machine-time state - if ( status != null ) { - status.printf("%s\t%s\t%d\t%s%n", id, STATUS_FORMAT.format(machineTime), machineTime, state); - status.flush(); - } - } - - - /** - * Lock the data structure, preventing other threads / processes from reading and writing to the - * common store - * @param id the name of the process doing the locking - */ - private 
final void lock(String id) { - //lockWaitTimer.restart(); - boolean hadLock = lock.ownsLock(); - if ( ! hadLock ) { - nLocks++; - //printStatus(id, lockWaitTimer.currentTime(), GOING_FOR_LOCK); - } - lock.lock(); - //lockWaitTimer.stop(); - //if ( ! hadLock ) printStatus(id, lockWaitTimer.currentTime(), HAVE_LOCK); - } - - /** - * Unlock the data structure, allowing other threads / processes to read and write to the common store - * @param id the name of the process doing the unlocking - */ - private final void unlock(String id) { - if ( lock.getHoldCount() == 1 ) printStatus(id, timer.currentTime(), RELEASING_LOCK); - lock.unlock(); - if ( ! lock.ownsLock() ) printStatus(id, timer.currentTime(), RUNNING); - } - - // useful code for getting - public final long getNLocks() { return nLocks; } - public final long getNReads() { return nReads; } - public final long getNWrites() { return nWrites; } -// public final double getTimePerLock() { return lockWaitTimer.getElapsedTime() / Math.max(nLocks, 1); } -// public final double getTimePerRead() { return readTimer.getElapsedTime() / Math.max(nReads,1); } -// public final double getTimePerWrite() { return writeTimer.getElapsedTime() / Math.max(nWrites,1); } - - // -------------------------------------------------------------------------------- - // - // Java-style functional form for with lock do { x }; - // - // -------------------------------------------------------------------------------- - - /** - * Private utility class that executes doBody() method with the lock() acquired and - * handles property unlock()ing the system, even if an error occurs. 
Allows one to write - * clean code like: - * - * new WithLock(name) { - * public Integer doBody() { doSomething(); return 1; } - * }.run() - * - * @param the return type of the doBody() method - */ - private abstract class WithLock { - private final String myName; - - public WithLock(String myName) { - this.myName = myName; - } - - protected abstract T doBody(); - - public T run() { - boolean locked = false; - try { - lock(myName); - locked = true; - return doBody(); - } finally { - if (locked) unlock(myName); - } - } - } - - // -------------------------------------------------------------------------------- - // - // main function for testing performance - // - // -------------------------------------------------------------------------------- - public static void main(String[] args) { - //BasicConfigurator.configure(); - - final String ref = args[0]; - final File file = new File(args[1]); - final int cycles = Integer.valueOf(args[2]); - - File referenceFile = new File(ref); - try { - final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(referenceFile); - final String chr1 = fasta.getSequenceDictionary().getSequence(1).getSequenceName(); - final GenomeLocParser genomeLocParser = new GenomeLocParser(fasta); - - final class MyTest { - String name; - GenomeLocProcessingTracker tracker; - - MyTest(String name, GenomeLocProcessingTracker tracker) { - this.name = name; - this.tracker = tracker; - } - - public void execute(int cycles) { - SimpleTimer delta = new SimpleTimer("delta"); - SimpleTimer timer = new SimpleTimer("none"); - - if ( file.exists() ) file.delete(); - timer.start(); - delta.start(); - for ( int i = 1; i < cycles; i++ ) { - tracker.claimOwnership(genomeLocParser.createGenomeLoc(chr1, i, i+1), "ABCDEFGHIJKL"); - if ( i % 1000 == 0 ) { - System.out.printf("%s\t%d\t%d\t%.4f\t%.4f%n", name, i, timer.currentTime(), timer.getElapsedTime(), delta.getElapsedTime() ); - delta.restart(); - } - } - } - } - - 
System.out.printf("name\tcycle\tcurrent.time\telapsed.time\tdelta%n"); - new MyTest("in-memory", new SharedMemoryGenomeLocProcessingTracker(new ClosableReentrantLock())).execute(cycles); - new MyTest("nio", new FileBackedGenomeLocProcessingTracker(file, genomeLocParser, new ClosableReentrantLock(), null)).execute(cycles); - new MyTest("nio-file-lock", new FileBackedGenomeLocProcessingTracker(file, genomeLocParser, new SharedFileThreadSafeLock(file,1), null)).execute(cycles); - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(referenceFile,ex); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/NoOpGenomeLocProcessingTracker.java b/public/java/src/org/broadinstitute/sting/utils/threading/NoOpGenomeLocProcessingTracker.java deleted file mode 100644 index ad2a6d31b..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/NoOpGenomeLocProcessingTracker.java +++ /dev/null @@ -1,26 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import java.util.Collection; -import java.util.Collections; -import java.util.List; - -/** - * Base class, and null tracker. Always says that a GenomeLoc is ready for processing. It is - * critical that this class already return that a loc is owned, no matter if it's been seen before, - * etc. 
ReadShards can differ in their contents but have the same "unmapped" genome loc - */ -public class NoOpGenomeLocProcessingTracker extends GenomeLocProcessingTracker { - public NoOpGenomeLocProcessingTracker() { - super(new ClosableReentrantLock(), null); - } - - @Override - protected void registerNewLocs(Collection loc) { - ; - } - - @Override - protected List readNewLocs() { - return Collections.emptyList(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ProcessingLoc.java b/public/java/src/org/broadinstitute/sting/utils/threading/ProcessingLoc.java deleted file mode 100644 index ee2283dcf..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/ProcessingLoc.java +++ /dev/null @@ -1,71 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.HasGenomeLocation; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 1/19/11 - * Time: 8:06 AM - * - * Information about processing locations and their owners. Contains two basic data, associated - * together. The first is a genome loc, and the second is the name of the owner, as a string. - * - * chr1:1-10 Mark - * chr2:11-20 DePristo - * - * would be two ProcessingLocs that first indicate that the first 10 bp of chr1 are owned by Mark, - * and the second is owned by DePristo. 
- */ -public class ProcessingLoc implements HasGenomeLocation { - private final GenomeLoc loc; - private final String owner; - - /** - * Create a loc that's already owned - * @param loc - * @param owner - */ - public ProcessingLoc(GenomeLoc loc, String owner) { - if ( loc == null || owner == null ) { - throw new ReviewedStingException("BUG: invalid ProcessingLoc detected: " + loc + " owner " + owner); - } - - this.loc = loc; - this.owner = owner.intern(); // reduce memory consumption by interning the string - } - - public GenomeLoc getLocation() { - return loc; - } - - public String getOwner() { - return owner; - } - - /** - * Returns true iff the owner of this processing loc is name. Can be used to determine - * the owner of this processing location. - * - * @param name - * @return - */ - public boolean isOwnedBy(String name) { - return getOwner().equals(name); - } - - public String toString() { return String.format("ProcessingLoc(%s,%s)", loc, owner); } - - public boolean equals(Object other) { - if (other instanceof ProcessingLoc ) - return this.loc.equals(((ProcessingLoc)other).loc) && this.owner.equals(((ProcessingLoc)other).owner); - else - return false; - } - - public int compareTo(ProcessingLoc other) { - return this.getLocation().compareTo(other.getLocation()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/SharedFileLock.java b/public/java/src/org/broadinstitute/sting/utils/threading/SharedFileLock.java deleted file mode 100644 index 0f47da413..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/SharedFileLock.java +++ /dev/null @@ -1,171 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import org.apache.log4j.Logger; -import org.apache.lucene.store.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.File; -import java.io.IOException; - -/** - * User: depristo - * Date: 1/19/11 - * 
Time: 8:24 AM - * - * A reentrant lock for a shared file common file in the file system. Relies on a a Lucene SimpleFSLock - * to manage on disk file locking. - */ -public class SharedFileLock extends ClosableReentrantLock { // todo -- kinda gross inheritance. The super lock is never used - private static Logger logger = Logger.getLogger(SharedFileLock.class); - - private static final String VERIFY_HOST = System.getProperty("verify.host", "gsa1"); - private static final boolean VERIFY = false; - private static final int VERIFY_PORT = 5050; - - // 5 minutes => 360 seconds of trying -> failure - protected static final int DEFAULT_N_TRIES = 1000; - protected static final long DEFAULT_MILLISECONDS_PER_TRY = 360; - - /** The file we are locking */ - private final File file; - - private final LockFactory lockFactory; - private Lock fileLock = null; - - /** - * A counter that indicates the number of 'locks' on this file. - * If locks == 2, then two unlocks are required - * before any resources are freed. - */ - int fileLockReentrantCounter = 0; - - // type of locking - private final int nRetries; - private final long milliSecPerTry; - - /** - * Create a SharedFileThreadSafeLock object locking the file - * @param file - */ - public SharedFileLock(File file, int nRetries, long milliSecPerTry, int ID) { - super(); - this.file = file; - this.nRetries = nRetries; - this.milliSecPerTry = milliSecPerTry; - - File lockDir = new File(file.getParent() == null ? 
"./" : file.getParent()); - try { - LockFactory factory = new SimpleFSLockFactory(lockDir); - if ( VERIFY ) { // don't forget to start up the VerifyLockServer - this.lockFactory = new VerifyingLockFactory((byte)ID, factory, VERIFY_HOST, VERIFY_PORT); - } else { - this.lockFactory = factory; - } - } catch (IOException e) { - throw new UserException.CouldNotCreateOutputFile(lockDir, "Could not create coordination file locking directory " + lockDir, e); - } - } - - public SharedFileLock(File file, int ID) { - this(file, DEFAULT_N_TRIES, DEFAULT_MILLISECONDS_PER_TRY, ID); - } - - @Override - public void close() { - if ( ownsLock() ) throw new ReviewedStingException("closing SharedFileLock while still owned: ownership count " + fileLockReentrantCounter); - } - - @Override - public int getHoldCount() { - return fileLockReentrantCounter; - } - - @Override - public boolean ownsLock() { - return fileLockReentrantCounter > 0; - } - - // ------------------------------------------------------------------------------------------ - // - // workhorse routines -- acquiring file locks - // - // ------------------------------------------------------------------------------------------ - - private boolean obtainFileLock() throws IOException { - // annoying bug work around for verifylockserver - if ( VERIFY ) - try { - return fileLock.obtain(1); - } catch ( LockObtainFailedException e ) { - return false; - } - else - return fileLock.obtain(); - } - - /** - * Two stage [threading then file] locking mechanism. Reenterant in that multiple lock calls will be - * unwound appropriately. Uses file channel lock *after* thread locking. 
- */ - @Override - public void lock() { - if ( SharedFileThreadSafeLock.DEBUG ) logger.warn(" lock() " + Thread.currentThread().getName() + ", fileLockReentrantCounter = " + fileLockReentrantCounter); - if ( fileLockReentrantCounter++ == 0 ) { - // Precondition -- lock is always null while we don't have a lock - if ( fileLock != null ) - throw new ReviewedStingException("BUG: lock() function called when a lock already is owned!"); - - int i = 1; - fileLock = lockFactory.makeLock(file.getName() + ".lock"); - try { - boolean obtained = obtainFileLock(); // todo -- maybe use intrinsic lock features - for ( ; ! obtained && i < nRetries; i++ ) { - try { - //logger.warn("tryLock failed on try " + i + ", waiting " + milliSecPerTry + " millseconds for retry"); - Thread.sleep(milliSecPerTry); - } catch ( InterruptedException e ) { - throw new UserException("SharedFileThreadSafeLock interrupted during wait for file lock", e); - } - obtained = obtainFileLock(); // gross workaround for error in verify server - } - - if ( i > 1 ) logger.warn("tryLock required " + i + " tries before completing, waited " + i * milliSecPerTry + " millseconds"); - - if ( ! obtained ) { - fileLock = null; - // filelock == null -> we never managed to acquire the lock! 
- throw new UserException("SharedFileThreadSafeLock failed to obtain the lock after " + nRetries + " attempts"); - } - - if ( SharedFileThreadSafeLock.DEBUG ) logger.warn(" lock() " + Thread.currentThread().getName() + ", obtained = " + obtained + ", tries = " + i); - } catch (IOException e) { - fileLock = null; - throw new ReviewedStingException("Coordination file could not be created because a lock could not be obtained.", e); - } - } - } - - @Override - public void unlock() { - // update for reentrant unlocking - if ( fileLock == null ) throw new ReviewedStingException("BUG: file lock is null -- file lock was not obtained"); - if ( fileLockReentrantCounter <= 0 ) throw new ReviewedStingException("BUG: file lock counter < 0"); - - // this unlock counts as 1 unlock. If this is our last unlock, actually do something - if ( SharedFileThreadSafeLock.DEBUG ) logger.warn(" unlock() " + Thread.currentThread().getName() + ", count = " + fileLockReentrantCounter); - if ( --fileLockReentrantCounter == 0 ) { - try { - if ( ! 
fileLock.isLocked() ) throw new ReviewedStingException("BUG: call to unlock() when we don't have a valid lock!"); - fileLock.release(); - if ( SharedFileThreadSafeLock.DEBUG ) logger.warn(" unlock() " + Thread.currentThread().getName() + ", actually releasing"); - } catch ( IOException e ) { - throw new ReviewedStingException("Could not free file lock on file " + file, e); - } finally { // make sure we null out the filelock, regardless of our state - fileLock = null; - } - } else { - if ( SharedFileThreadSafeLock.DEBUG ) logger.warn(" unlock() " + Thread.currentThread().getName() + ", skipping, count = " + fileLockReentrantCounter); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/SharedFileThreadSafeLock.java b/public/java/src/org/broadinstitute/sting/utils/threading/SharedFileThreadSafeLock.java deleted file mode 100644 index d70879a0a..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/SharedFileThreadSafeLock.java +++ /dev/null @@ -1,75 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.io.File; - -/** - * User: depristo - * Date: 1/19/11 - * Time: 8:24 AM - * - * A reentrant lock that supports multi-threaded locking as well as a shared file lock on a common - * file in the file system. It itself a shared memory reenterant lock to managed thread safety and - * contains a SharedFileLock to handle the file integrity. 
- */ -public class SharedFileThreadSafeLock extends ClosableReentrantLock { - private static Logger logger = Logger.getLogger(SharedFileThreadSafeLock.class); - protected static final boolean DEBUG = false; - - private final SharedFileLock fileLock; - - /** - * Create a SharedFileThreadSafeLock object locking the file - * @param file - */ - public SharedFileThreadSafeLock(File file, int nRetries, long milliSecPerTry, int ID) { - super(); - this.fileLock = new SharedFileLock(file, nRetries, milliSecPerTry, ID); - } - - public SharedFileThreadSafeLock(File file, int ID) { - this(file, SharedFileLock.DEFAULT_N_TRIES, SharedFileLock.DEFAULT_MILLISECONDS_PER_TRY, ID); - } - - @Override - public void close() { - super.close(); - fileLock.close(); - } - - @Override - public int getHoldCount() { - if ( super.getHoldCount() != fileLock.getHoldCount() ) - throw new ReviewedStingException("BUG: unequal hold counts. threadlock = " + super.getHoldCount() + ", filelock = " + fileLock.getHoldCount()); - return super.getHoldCount(); - } - - @Override - public boolean ownsLock() { - return super.isHeldByCurrentThread() && fileLock.ownsLock(); - } - - /** - * Two stage [threading then file] locking mechanism. Reenterant in that multiple lock calls will be - * unwound appropriately. Uses file channel lock *after* thread locking. - */ - @Override - public void lock() { - if ( DEBUG ) logger.warn("Attempting SharedFileThreadSafe lock: " + Thread.currentThread().getName()); - if ( DEBUG ) logger.warn(" going for thread lock: " + Thread.currentThread().getName()); - super.lock(); - if ( DEBUG ) logger.warn(" going for file lock: " + Thread.currentThread().getName()); - fileLock.lock(); // todo -- should this be in a try? 
- } - - @Override - public void unlock() { - if ( DEBUG ) logger.warn(" releasing filelock: " + Thread.currentThread().getName()); - fileLock.unlock(); - if ( DEBUG ) logger.warn(" releasing threadlock: " + Thread.currentThread().getName()); - super.unlock(); - if ( DEBUG ) logger.warn(" unlock() complete: " + Thread.currentThread().getName()); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/SharedMemoryGenomeLocProcessingTracker.java b/public/java/src/org/broadinstitute/sting/utils/threading/SharedMemoryGenomeLocProcessingTracker.java deleted file mode 100644 index 9bf8b58b1..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/SharedMemoryGenomeLocProcessingTracker.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.broadinstitute.sting.utils.threading; - -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -/** - * Thread-safe shared memory only implementation. Uses a simple list to manage the newly - * added processing locations. 
- */ -public class SharedMemoryGenomeLocProcessingTracker extends GenomeLocProcessingTracker { - private List newPLocs = new ArrayList(); - - protected SharedMemoryGenomeLocProcessingTracker(ClosableReentrantLock lock) { - super(lock, null); - } - - protected SharedMemoryGenomeLocProcessingTracker(ClosableReentrantLock lock, PrintStream status) { - super(lock, status); - } - - @Override - protected void registerNewLocs(Collection plocs) { - newPLocs.addAll(plocs); - } - - @Override - protected List readNewLocs() { - List r = newPLocs; - newPLocs = new ArrayList(); - return r; - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java index a191670a4..a752f4a1b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableVariantContext.java @@ -27,15 +27,15 @@ public class MutableVariantContext extends VariantContext { } public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); + super(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); } public MutableVariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes) { - this(source, contig, start, stop, alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); + super(source, contig, start, stop, alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null); } public MutableVariantContext(VariantContext parent) { - this(parent.getSource(), parent.contig, parent.start, parent.stop, parent.getAlleles(), parent.getGenotypes(), parent.getNegLog10PError(), parent.getFilters(), 
parent.getAttributes()); + super(parent.getSource(), parent.contig, parent.start, parent.stop, parent.getAlleles(), parent.getGenotypes(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.getReferenceBaseForIndel()); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index eab392c4d..fff1961c6 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -5,6 +5,7 @@ import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.*; @@ -163,11 +164,12 @@ import java.util.*; public class VariantContext implements Feature { // to enable tribble intergration protected InferredGeneticContext commonInfo = null; public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR; - public final static String REFERENCE_BASE_FOR_INDEL_KEY = "_REFERENCE_BASE_FOR_INDEL_"; public final static String UNPARSED_GENOTYPE_MAP_KEY = "_UNPARSED_GENOTYPE_MAP_"; public final static String UNPARSED_GENOTYPE_PARSER_KEY = "_UNPARSED_GENOTYPE_PARSER_"; public final static String ID_KEY = "ID"; + private final Byte REFERENCE_BASE_FOR_INDEL; + public final static Set PASSES_FILTERS = Collections.unmodifiableSet(new LinkedHashSet()); /** The location of this VariantContext */ @@ -205,6 +207,24 @@ public class VariantContext implements Feature { // to enable tribble intergrati // --------------------------------------------------------------------------------------------------------- + /** + * the complete constructor. 
Makes a complete VariantContext from its arguments + * + * @param source source + * @param contig the contig + * @param start the start base (one based) + * @param stop the stop reference base (one based) + * @param alleles alleles + * @param genotypes genotypes map + * @param negLog10PError qual + * @param filters filters: use null for unfiltered and empty set for passes filters + * @param attributes attributes + * @param referenceBaseForIndel padded reference base + */ + public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { + this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, referenceBaseForIndel, false); + } + /** * the complete constructor. Makes a complete VariantContext from its arguments * @@ -219,7 +239,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param attributes attributes */ public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, false); + this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, null, false); } /** @@ -239,7 +259,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param attributes attributes */ public VariantContext(String source, String contig, long start, long stop, Collection alleles, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, true); + this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, null, true); } /** @@ -256,7 +276,7 @@ public class VariantContext implements Feature { // to enable tribble 
intergrati * @param attributes attributes */ public VariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes, false); + this(source, contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes, null, false); } /** @@ -269,7 +289,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param alleles alleles */ public VariantContext(String source, String contig, long start, long stop, Collection alleles) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, false); + this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, null, false); } /** @@ -292,7 +312,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param other the VariantContext to copy */ public VariantContext(VariantContext other) { - this(other.getSource(), other.getChr(), other.getStart(), other.getEnd() , other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.filtersWereApplied() ? other.getFilters() : null, other.getAttributes(), false); + this(other.getSource(), other.getChr(), other.getStart(), other.getEnd() , other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.filtersWereApplied() ? 
other.getFilters() : null, other.getAttributes(), other.REFERENCE_BASE_FOR_INDEL, false); } /** @@ -307,8 +327,13 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param negLog10PError qual * @param filters filters: use null for unfiltered and empty set for passes filters * @param attributes attributes + * @param referenceBaseForIndel padded reference base + * @param genotypesAreUnparsed true if the genotypes have not yet been parsed */ - private VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes, boolean genotypesAreUnparsed) { + private VariantContext(String source, String contig, long start, long stop, + Collection alleles, Map genotypes, + double negLog10PError, Set filters, Map attributes, + Byte referenceBaseForIndel, boolean genotypesAreUnparsed) { if ( contig == null ) { throw new IllegalArgumentException("Contig cannot be null"); } this.contig = contig; this.start = start; @@ -323,6 +348,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati this.commonInfo = new InferredGeneticContext(source, negLog10PError, filters, attributes); filtersWereAppliedToContext = filters != null; + REFERENCE_BASE_FOR_INDEL = referenceBaseForIndel; if ( alleles == null ) { throw new IllegalArgumentException("Alleles cannot be null"); } @@ -355,23 +381,23 @@ public class VariantContext implements Feature { // to enable tribble intergrati // --------------------------------------------------------------------------------------------------------- public static VariantContext modifyGenotypes(VariantContext vc, Map genotypes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? 
vc.getFilters() : null, new HashMap(vc.getAttributes()), false); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), false); } public static VariantContext modifyLocation(VariantContext vc, String chr, int start, int end) { - return new VariantContext(vc.getSource(), chr, start, end, vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), true); + return new VariantContext(vc.getSource(), chr, start, end, vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true); } public static VariantContext modifyFilters(VariantContext vc, Set filters) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd() , vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), filters, new HashMap(vc.getAttributes()), true); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd() , vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), filters, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true); } public static VariantContext modifyAttributes(VariantContext vc, Map attributes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, attributes, true); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? 
vc.getFilters() : null, attributes, vc.getReferenceBaseForIndel(), true); } public static VariantContext modifyPErrorFiltersAndAttributes(VariantContext vc, double negLog10PError, Set filters, Map attributes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, negLog10PError, filters, attributes, true); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, negLog10PError, filters, attributes, vc.getReferenceBaseForIndel(), true); } // --------------------------------------------------------------------------------------------------------- @@ -414,7 +440,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @return vc subcontext */ public VariantContext subContextFromGenotypes(Collection genotypes, Set alleles) { - return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), filtersWereApplied() ? getFilters() : null, getAttributes()); + return new VariantContext(getSource(), contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, getNegLog10PError(), filtersWereApplied() ? 
getFilters() : null, getAttributes(), getReferenceBaseForIndel()); } @@ -603,6 +629,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati return (String)commonInfo.getAttribute(ID_KEY); } + public boolean hasReferenceBaseForIndel() { + return REFERENCE_BASE_FOR_INDEL != null; + } + + // the indel base that gets stripped off for indels + public Byte getReferenceBaseForIndel() { + return REFERENCE_BASE_FOR_INDEL; + } + // --------------------------------------------------------------------------------------------------------- // // get routines to access context info fields @@ -1020,11 +1055,12 @@ public class VariantContext implements Feature { // to enable tribble intergrati * Run all extra-strict validation tests on a Variant Context object * * @param reference the true reference allele + * @param paddedRefBase the reference base used for padding indels * @param rsIDs the true dbSNP IDs */ - public void extraStrictValidation(Allele reference, Set rsIDs) { + public void extraStrictValidation(Allele reference, Byte paddedRefBase, Set rsIDs) { // validate the reference - validateReferenceBases(reference); + validateReferenceBases(reference, paddedRefBase); // validate the RS IDs validateRSIDs(rsIDs); @@ -1039,11 +1075,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati //checkReferenceTrack(); } - public void validateReferenceBases(Allele reference) { + public void validateReferenceBases(Allele reference, Byte paddedRefBase) { // don't validate if we're an insertion if ( !reference.isNull() && !reference.basesMatch(getReference()) ) { throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, %s vs. 
%s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString())); } + + // we also need to validate the padding base for simple indels + if ( hasReferenceBaseForIndel() && !getReferenceBaseForIndel().equals(paddedRefBase) ) + throw new TribbleException.InternalCodecException(String.format("the padded REF base is incorrect for the record at position %s:%d, %s vs. %s", getChr(), getStart(), (char)getReferenceBaseForIndel().byteValue(), (char)paddedRefBase.byteValue())); } public void validateRSIDs(Set rsIDs) { @@ -1151,6 +1191,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati private boolean validate(boolean throwException) { try { + validateReferencePadding(); validateAlleles(); validateGenotypes(); } catch ( IllegalArgumentException e ) { @@ -1163,6 +1204,13 @@ public class VariantContext implements Feature { // to enable tribble intergrati return true; } + private void validateReferencePadding() { + boolean needsPadding = hasSymbolicAlleles() || (getReference().length() == getEnd() - getStart()); // off by one because padded base was removed + + if ( needsPadding && !hasReferenceBaseForIndel() ) + throw new ReviewedStingException("Badly formed variant context at location " + getChr() + ":" + getStart() + "; no padded reference base was provided."); + } + private void validateAlleles() { // check alleles boolean alreadySeenRef = false, alreadySeenNull = false; @@ -1221,16 +1269,6 @@ public class VariantContext implements Feature { // to enable tribble intergrati // // --------------------------------------------------------------------------------------------------------- - // the indel base that gets stripped off for indels - public boolean hasReferenceBaseForIndel() { - return hasAttribute(REFERENCE_BASE_FOR_INDEL_KEY); - } - - // the indel base that gets stripped off for indels - public byte getReferenceBaseForIndel() { - return hasReferenceBaseForIndel() ? 
(Byte)getAttribute(REFERENCE_BASE_FOR_INDEL_KEY) : (byte)'N'; - } - private void determineType() { if ( type == null ) { switch ( getNAlleles() ) { @@ -1357,8 +1395,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati return false; } - public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, byte inputRefBase, boolean refBaseShouldBeAppliedToEndOfAlleles) { - Allele refAllele = inputVC.getReference(); + public static VariantContext createVariantContextWithPaddedAlleles(VariantContext inputVC, boolean refBaseShouldBeAppliedToEndOfAlleles) { // see if we need to pad common reference base from all alleles boolean padVC; @@ -1368,31 +1405,20 @@ public class VariantContext implements Feature { // to enable tribble intergrati long locLength = (inputVC.getEnd() - inputVC.getStart()) + 1; if (inputVC.hasSymbolicAlleles()) padVC = true; - else if (refAllele.length() == locLength) + else if (inputVC.getReference().length() == locLength) padVC = false; - else if (refAllele.length() == locLength-1) + else if (inputVC.getReference().length() == locLength-1) padVC = true; else throw new IllegalArgumentException("Badly formed variant context at location " + String.valueOf(inputVC.getStart()) + " in contig " + inputVC.getChr() + ". 
Reference length must be at most one base shorter than location size"); - // nothing to do if we don't need to pad bases if (padVC) { - Byte refByte; - Map attributes = inputVC.getAttributes(); + if ( !inputVC.hasReferenceBaseForIndel() ) + throw new ReviewedStingException("Badly formed variant context at location " + inputVC.getChr() + ":" + inputVC.getStart() + "; no padded reference base is available."); - // upper-case for consistency; note that we can safely make these casts because the input is constrained to be a byte - inputRefBase = (byte)Character.toUpperCase((char)inputRefBase); - if (attributes.containsKey(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY)) - refByte = (Byte)attributes.get(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY); - else if (inputRefBase == 'A' || inputRefBase == 'T' || inputRefBase == 'C' || inputRefBase == 'G' || inputRefBase == 'N') - refByte = inputRefBase; - else - throw new IllegalArgumentException("Error when trying to pad Variant Context at location " + String.valueOf(inputVC.getStart()) - + " in contig " + inputVC.getChr() + - ". Either input reference base ("+(char)inputRefBase+ - ", ascii code="+inputRefBase+") must be a regular base, or input VC must contain reference base key"); + Byte refByte = inputVC.getReferenceBaseForIndel(); List alleles = new ArrayList(); Map genotypes = new TreeMap(); @@ -1444,11 +1470,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati // Do not change the filter state if filters were not applied to this context Set inputVCFilters = inputVC.filtersWereAppliedToContext ? 
inputVC.getFilters() : null; - return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), - inputVCFilters, attributes); - - - + return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVCFilters, inputVC.getAttributes()); } else return inputVC; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 5a5671056..7d10749ee 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -289,22 +289,19 @@ public class VariantContextUtils { /** * Returns a newly allocated VC that is the same as VC, but without genotypes - * @param vc - * @return + * @param vc variant context + * @return new VC without genotypes */ @Requires("vc != null") @Ensures("result != null") public static VariantContext sitesOnlyVariantContext(VariantContext vc) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), - vc.getAlleles(), vc.getNegLog10PError(), - vc.filtersWereApplied() ? vc.getFilters() : null, - vc.getAttributes()); + return VariantContext.modifyGenotypes(vc, null); } /** * Returns a newly allocated list of VC, where each VC is the same as the input VCs, but without genotypes - * @param vcs - * @return + * @param vcs collection of VCs + * @return new VCs without genotypes */ @Requires("vcs != null") @Ensures("result != null") @@ -362,9 +359,9 @@ public class VariantContextUtils { * information per genotype. The master merge will add the PQ information from each genotype record, where * appropriate, to the master VC. 
* - * @param unsortedVCs - * @param masterName - * @return + * @param unsortedVCs collection of VCs + * @param masterName name of master VC + * @return master-merged VC */ public static VariantContext masterMerge(Collection unsortedVCs, String masterName) { VariantContext master = findMaster(unsortedVCs, masterName); @@ -435,22 +432,43 @@ public class VariantContextUtils { * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with * the sample name * - * @param unsortedVCs - * @param priorityListOfVCs - * @param filteredRecordMergeType - * @param genotypeMergeOptions - * @return + * @param genomeLocParser loc parser + * @param unsortedVCs collection of unsorted VCs + * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs + * @param filteredRecordMergeType merge type for filtered records + * @param genotypeMergeOptions merge option for genotypes + * @param annotateOrigin should we annotate the set it came from? + * @param printMessages should we print messages? + * @param inputRefBase the ref base + * @return new VariantContext */ public static VariantContext simpleMerge(GenomeLocParser genomeLocParser, Collection unsortedVCs, List priorityListOfVCs, FilteredRecordMergeType filteredRecordMergeType, GenotypeMergeType genotypeMergeOptions, boolean annotateOrigin, boolean printMessages, byte inputRefBase ) { - return simpleMerge(genomeLocParser, unsortedVCs, priorityListOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, inputRefBase, "set", false, false); + return simpleMerge(genomeLocParser, unsortedVCs, priorityListOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, "set", false, false); } + /** + * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. 
+ * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with + * the sample name + * + * @param genomeLocParser loc parser + * @param unsortedVCs collection of unsorted VCs + * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs + * @param filteredRecordMergeType merge type for filtered records + * @param genotypeMergeOptions merge option for genotypes + * @param annotateOrigin should we annotate the set it came from? + * @param printMessages should we print messages? + * @param setKey the key name of the set + * @param filteredAreUncalled are filtered records uncalled? + * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? + * @return new VariantContext + */ public static VariantContext simpleMerge(GenomeLocParser genomeLocParser, Collection unsortedVCs, List priorityListOfVCs, FilteredRecordMergeType filteredRecordMergeType, GenotypeMergeType genotypeMergeOptions, - boolean annotateOrigin, boolean printMessages, byte inputRefBase, String setKey, + boolean annotateOrigin, boolean printMessages, String setKey, boolean filteredAreUncalled, boolean mergeInfoWithMaxAC ) { if ( unsortedVCs == null || unsortedVCs.size() == 0 ) return null; @@ -468,9 +486,9 @@ public class VariantContextUtils { for (VariantContext vc : prepaddedVCs) { // also a reasonable place to remove filtered calls, if needed if ( ! 
filteredAreUncalled || vc.isNotFiltered() ) - VCs.add(VariantContext.createVariantContextWithPaddedAlleles(vc,inputRefBase,false)); + VCs.add(VariantContext.createVariantContextWithPaddedAlleles(vc, false)); } - if ( VCs.size() == 0 ) // everything is filtered out and we're filteredareUncalled + if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled return null; // establish the baseline info from the first VC @@ -615,6 +633,17 @@ public class VariantContextUtils { return merged; } + public static Map> separateVariantContextsByType(Collection VCs) { + HashMap> mappedVCs = new HashMap>(); + for ( VariantContext vc : VCs ) { + if ( !mappedVCs.containsKey(vc.getType()) ) + mappedVCs.put(vc.getType(), new ArrayList()); + mappedVCs.get(vc.getType()).add(vc); + } + + return mappedVCs; + } + private static class AlleleMapper { private VariantContext vc = null; private Map map = null; @@ -834,6 +863,7 @@ public class VariantContextUtils { /** * create a genome location, given a variant context + * @param genomeLocParser parser * @param vc the variant context * @return the genomeLoc */ diff --git a/public/java/test/org/broadinstitute/sting/MD5DB.java b/public/java/test/org/broadinstitute/sting/MD5DB.java index bea9eaec5..0194e114a 100644 --- a/public/java/test/org/broadinstitute/sting/MD5DB.java +++ b/public/java/test/org/broadinstitute/sting/MD5DB.java @@ -47,6 +47,7 @@ public class MD5DB { /** * Subdirectory under the ant build directory where we store integration test md5 results */ + private static final int MAX_RECORDS_TO_READ = 10000; public static final String LOCAL_MD5_DB_DIR = "integrationtests"; public static final String GLOBAL_MD5_DB_DIR = "/humgen/gsa-hpprojects/GATK/data/integrationtests"; @@ -78,8 +79,8 @@ public class MD5DB { * @return */ public static String getMD5FilePath(final String md5, final String valueIfNotFound) { - // we prefer the local db to the global DB, so match it first - for ( String dir : 
Arrays.asList(LOCAL_MD5_DB_DIR, GLOBAL_MD5_DB_DIR)) { + // we prefer the global db to the local DB, so match it first + for ( String dir : Arrays.asList(GLOBAL_MD5_DB_DIR, LOCAL_MD5_DB_DIR)) { File f = getFileForMD5(md5, dir); if ( f.exists() && f.canRead() ) return f.getPath(); @@ -232,7 +233,7 @@ public class MD5DB { // inline differences DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(System.out, 20, 10, 0); - boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), params); + boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), MAX_RECORDS_TO_READ, params); if ( success ) System.out.printf("Note that the above list is not comprehensive. At most 20 lines of output, and 10 specific differences will be listed. Please use -T DiffObjects -R public/testdata/exampleFASTA.fasta -m %s -t %s to explore the differences more freely%n", pathToExpectedMD5File, pathToFileMD5File); diff --git a/public/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java new file mode 100644 index 000000000..a6af034cb --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java @@ -0,0 +1,27 @@ +package org.broadinstitute.sting.alignment; + +import org.testng.annotations.Test; +import org.broadinstitute.sting.WalkerTest; + +import java.util.Arrays; + +/** + * Integration tests for the aligner. 
+ * + * @author mhanna + * @version 0.1 + */ +public class AlignerIntegrationTest extends WalkerTest { + @Test + public void testBasicAlignment() { + String md5 = "a2bdf907b18114a86ca47f9fc23791bf"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + GATKDataLocation + "bwa/human_b36_both.fasta" + + " -T Align" + + " -I " + validationDataLocation + "NA12878_Pilot1_20.trimmed.unmapped.bam" + + " -o %s", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testBasicAlignment", spec); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java new file mode 100644 index 000000000..02e1ba99a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.report; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class GATKReportUnitTest extends BaseTest { + @Test + public void testParse() throws Exception { + String reportPath = validationDataLocation + "exampleGATKReport.eval"; + GATKReport report = new GATKReport(reportPath); + + GATKReportTable countVariants = report.getTable("CountVariants"); + Assert.assertEquals(countVariants.getVersion(), GATKReportVersion.V0_1); + Object countVariantsPK = countVariants.getPrimaryKey("none.eval.none.all"); + Assert.assertEquals(countVariants.get(countVariantsPK, "nProcessedLoci"), "100000"); + Assert.assertEquals(countVariants.get(countVariantsPK, "nNoCalls"), "99872"); + + GATKReportTable validationReport = report.getTable("ValidationReport"); + Assert.assertEquals(validationReport.getVersion(), GATKReportVersion.V0_1); + Object validationReportPK = countVariants.getPrimaryKey("none.eval.none.known"); + Assert.assertEquals(validationReport.get(validationReportPK, "sensitivity"), "NaN"); + + GATKReportTable simpleMetricsByAC = report.getTable("SimpleMetricsByAC.metrics"); + Assert.assertEquals(simpleMetricsByAC.getVersion(), GATKReportVersion.V0_1); + Object simpleMetricsByACPK = simpleMetricsByAC.getPrimaryKey("none.eval.none.novel.ac2"); + Assert.assertEquals(simpleMetricsByAC.get(simpleMetricsByACPK, "AC"), "2"); + + Assert.assertFalse(simpleMetricsByAC.containsPrimaryKey("none.eval.none.novel.ac2.bad")); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java deleted file mode 100755 index c75a5b2dc..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/genomicannotator/GenomicAnnotatorIntegrationTest.java +++ /dev/null @@ -1,83 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.annotator.genomicannotator; - - -import java.util.Arrays; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -public class GenomicAnnotatorIntegrationTest extends WalkerTest { - String testFileWithIndels = validationDataLocation + "/GenomicAnnotatorValidation/1KGBroadWEx.cleaned.indels.vcf"; - String testFileWithSNPsAndIndels = validationDataLocation + "/GenomicAnnotatorValidation/1KGBroadWEx.variants.vcf"; - - @Test - public void testGenomicAnnotatorOnDbSNP() { - - /* - TODO put this test back in once it gets faster. - String[] md5 = {"d19d6d1eb52fb09e7493653dc645d92a"}; - WalkerTestSpec spec = new WalkerTestSpec( - "-T GenomicAnnotator -R " + b36KGReference + " " + - "-B:variant,vcf /humgen/gsa-hpprojects/GATK/data/Annotations/examples/CEU_hapmap_nogt_23_subset.vcf " + - "-B:dbsnp,AnnotatorInputTable /humgen/gsa-hpprojects/GATK/data/Annotations/dbsnp/b130/snp130-b36-only-the-SNPs.txt " + - "-m " + //generate many records from one input record if necessary - "-o %s " + - "-BTI variant", - 1, - Arrays.asList(md5)); - executeTest("test with dbSNP", spec); - */ - - - String[] md5WithDashSArg = {"efba4ce1641cfa2ef88a64395f2ebce8"}; - WalkerTestSpec specWithSArg = new WalkerTestSpec( - "-T GenomicAnnotator -R " + b36KGReference + - " -B:variant,vcf3 /humgen/gsa-hpprojects/GATK/data/Annotations/examples/CEU_hapmap_nogt_23_subset.vcf" + - " -B:dbsnp,AnnotatorInputTable /humgen/gsa-hpprojects/GATK/data/Annotations/dbsnp/b130/snp130-b36-only-the-SNPs.txt" + - " -m" + //generate many records from one input record if necessary - " -o %s" + - " -BTI variant" + - " -s 
dbsnp.name,dbsnp.refUCSC,dbsnp.strand,dbsnp.observed,dbsnp.avHet" + - " -NO_HEADER", - 1, - Arrays.asList(md5WithDashSArg)); - executeTest("test with dbSNP and -s arg", specWithSArg); - - } - - @Test - public void testGenomicAnnotatorOnIndels() { - WalkerTestSpec testOnIndels = new WalkerTestSpec( - buildCommandLine( - "-T GenomicAnnotator", - "-R " + b37KGReference, - "-L 22:10000000-20000000", - "-B:refseq,AnnotatorInputTable " + b37Refseq, - "-B:variant,VCF " + testFileWithIndels, - "-NO_HEADER", - "-o %s" - ), - 1, - Arrays.asList("772fc3f43b70770ec6c6acbb8bbbd4c0") - ); - executeTest("testGenomicAnnotatorOnIndels", testOnIndels); - } - - @Test - public void testGenomicAnnotatorOnSNPsAndIndels() { - WalkerTestSpec testOnSNPsAndIndels = new WalkerTestSpec( - buildCommandLine( - "-T GenomicAnnotator", - "-R " + b37KGReference, - "-L 22:10000000-20000000", - "-B:refseq,AnnotatorInputTable " + b37Refseq, - "-B:variant,VCF " + testFileWithSNPsAndIndels, - "-NO_HEADER", - "-o %s" - ), - 1, - Arrays.asList("081ade7f3d2d3c5f19cb1e8651a626f3") - ); - executeTest("testGenomicAnnotatorOnSNPsAndIndels", testOnSNPsAndIndels); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java index cca1eccb4..f9aaaecc1 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java @@ -30,8 +30,6 @@ import org.testng.annotations.Test; import java.io.File; import java.util.Arrays; -import java.util.Collections; -import java.util.List; public class DiffObjectsIntegrationTest extends WalkerTest { private class TestParams extends TestDataProvider { @@ -52,8 +50,8 @@ public class DiffObjectsIntegrationTest extends WalkerTest { @DataProvider(name = "data") public Object[][] 
createData() { - new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "fb7f4e011487ca56bce865ae5468cdc5"); - new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "423cec3befbf0a72d8bc3757ee628fc4"); + new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "92311de76dda3f38aac289d807ef23d0"); + new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "0c69412c385fda50210f2a612e1ffe4a"); return TestParams.getTests(TestParams.class); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 1f23d262e..88c5116b1 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("c97829259463d04b0159591bb6fb44af")); + Arrays.asList("16b0c7b47745abcd1ddaa2e261719530")); executeTest("test MultiSample Pilot1", spec); } @@ -54,12 +54,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("2b69667f4770e8c0c894066b7f27e440")); + Arrays.asList("811ddc0bd8322b14f14f58df8c627aa9")); 
executeTest("test MultiSample Pilot2 with alleles passed in", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("b77fe007c2a97fcd59dfd5eef94d8b95")); + Arrays.asList("5cf08dd7ac3d218082f7be3915ce0b15")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -67,7 +67,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("ee8a5e63ddd470726a749e69c0c20f60")); + Arrays.asList("75156264696563c2f47620fef9424f7c")); executeTest("test SingleSample Pilot2", spec); } @@ -77,7 +77,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "ef31654a2b85b9b2d3bba4f4a75a17b6"; + private final static String COMPRESSED_OUTPUT_MD5 = "7255e03430549cb97d8fcae34cbffb02"; @Test public void testCompressedOutput() { @@ -107,7 +107,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "46868a9c4134651c54535fb46b408aee"; + String md5 = "7912109e83fda21dae90ef8d5dd0140d"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -138,9 
+138,10 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testCallingParameters() { HashMap e = new HashMap(); - e.put( "--min_base_quality_score 26", "5043c9a101e691602eb7a3f9704bdf20" ); - e.put( "--min_mapping_quality_score 26", "71a833eb8fd93ee62ae0d5a430f27940" ); - e.put( "--p_nonref_model GRID_SEARCH", "ddf443e9dcadef367476b26b4d52c134" ); + e.put( "--min_base_quality_score 26", "6d3aa9f783ca63f37c952f83eeda593c" ); + e.put( "--min_mapping_quality_score 26", "51bfdf777123bf49de5d92ffde5c74e7" ); + e.put( "--p_nonref_model GRID_SEARCH", "333328ab2c8da2875fade599e80a271f" ); + e.put( "--computeSLOD", "226caa28a4fa9fe34f3beb8a23f3d53d" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -153,9 +154,9 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameter() { HashMap e = new HashMap(); - e.put( "-sites_only", "eaad6ceb71ab94290650a70bea5ab951" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "05bf7db8a3d19ef4a3d14772c90b732f" ); - e.put( "--output_mode EMIT_ALL_SITES", "e4b86740468d7369f0156550855586c7" ); + e.put( "-sites_only", "5f659dee408710d3709ed72005cd863a" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "55d09bf13149bddc06cc36be0801507b" ); + e.put( "--output_mode EMIT_ALL_SITES", "727f49dcb2439b18446829efc3b1561c" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -169,12 +170,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("71a833eb8fd93ee62ae0d5a430f27940")); + Arrays.asList("51bfdf777123bf49de5d92ffde5c74e7")); executeTest("test confidence 1", spec1); 
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("79968844dc3ddecb97748c1acf2984c7")); + Arrays.asList("c67c285e70fd4457c9f9ce7bd878ddca")); executeTest("test confidence 2", spec2); } @@ -186,8 +187,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "4e878664f61d2d800146d3762303fde1" ); - e.put( 1.0 / 1850, "9204caec095ff5e63ca21a10b6fab453" ); + e.put( 0.01, "7ecc564d4db97d5932cef2e558550ed2" ); + e.put( 1.0 / 1850, "aa9e101bb9f9e111fe292fec467d915a" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -211,7 +212,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("1a58ec52df545f946f80cc16c5736a91")); + Arrays.asList("2efd686186b2c5129be4cf89274a24dd")); executeTest(String.format("test multiple technologies"), spec); } @@ -230,25 +231,11 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("62d0f6d9de344ce68ce121c13b1e78b1")); + Arrays.asList("2892d35331fe9fc141ba19269ec7caed")); executeTest(String.format("test calling with BAQ"), spec); } - @Test - public void testCallingWithBAQOff() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + - " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" + - " -o %s" + - " -L 1:10,000,000-10,100,000" + - " -baq OFF", - 1, - Arrays.asList("1a58ec52df545f946f80cc16c5736a91")); - - executeTest(String.format("test calling with BAQ OFF"), spec); - } - // -------------------------------------------------------------------------------------------------------------- // 
// testing indel caller @@ -263,7 +250,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("631ae1f1eb6bc4c1a4136b8495250536")); + Arrays.asList("8c2afb4289ed44521933d1a74c8d6c7f")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -278,7 +265,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("fd556585c79e2b892a5976668f45aa43")); + Arrays.asList("b6fb70590a10e1c27fb611732916f27d")); executeTest(String.format("test indel caller in SLX witn low min allele count"), spec); } @@ -291,7 +278,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("9cd56feedd2787919e571383889fde70")); + Arrays.asList("61642502bd08cc03cdaaeb83a5426b46")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -301,14 +288,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("315e1b78d7a403d7fcbcf0caa8c496b8")); + Arrays.asList("69b0b3f089c80b9864294d838a061336")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -B:alleles,vcf " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("cf89e0c54f14482a23c105b73a333d8a")); + Arrays.asList("c90174cfd7dd68bdef36fe2c60145e10")); 
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java index 2676f7067..19dc99682 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java @@ -32,13 +32,6 @@ public class IndelRealignerIntegrationTest extends WalkerTest { 1, Arrays.asList(base_md5_with_SW_or_VCF)); executeTest("test realigner defaults with VCF", spec2); - - WalkerTestSpec spec3 = new WalkerTestSpec( - baseCommand + "-D " + GATKDataLocation + "dbsnp_129_b36.rod", - 1, - Arrays.asList(base_md5)); - executeTest("realigner defaults with dbsnp", spec3); - } @Test @@ -48,12 +41,6 @@ public class IndelRealignerIntegrationTest extends WalkerTest { 1, Arrays.asList("3dd5d2c9931b375455af0bff1a2c4888")); executeTest("realigner known indels only from VCF", spec1); - - WalkerTestSpec spec2 = new WalkerTestSpec( - baseCommand + "--consensusDeterminationModel KNOWNS_ONLY -D " + GATKDataLocation + "dbsnp_129_b36.rod", - 1, - Arrays.asList("05a114623c126b0398fbc1703437461e")); - executeTest("realigner known indels only from dbsnp", spec2); } @Test @@ -63,12 +50,6 @@ public class IndelRealignerIntegrationTest extends WalkerTest { 1, Arrays.asList(base_md5_with_SW_or_VCF)); executeTest("realigner use SW from VCF", spec1); - - WalkerTestSpec spec2 = new WalkerTestSpec( - baseCommand + "--consensusDeterminationModel USE_SW -D " + GATKDataLocation + "dbsnp_129_b36.rod", - 1, - Arrays.asList(base_md5_with_SW_or_VCF)); - executeTest("realigner use SW from dbsnp", spec2); } @Test diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java index fd5ad0b22..e8b5033cf 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerPerformanceTest.java @@ -30,7 +30,7 @@ public class IndelRealignerPerformanceTest extends WalkerTest { " -LOD 5" + " -maxConsensuses 100" + " -greedy 100" + - " -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" + + " -B:dbsnp,vcf " + GATKDataLocation + "dbsnp_132.hg18.vcf" + " -o /dev/null" + " -I " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.bam" + " -L chr1:1-5,650,000" + @@ -45,7 +45,7 @@ public class IndelRealignerPerformanceTest extends WalkerTest { " -LOD 5" + " -maxConsensuses 100" + " -greedy 100" + - " -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" + + " -B:dbsnp,vcf " + GATKDataLocation + "dbsnp_132.hg18.vcf" + " -o /dev/null" + " -I " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.bam" + " -L chr1:1-150,000,000" + diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java index 4b225aaea..60312dbd2 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java @@ -11,15 +11,15 @@ public class RealignerTargetCreatorIntegrationTest extends WalkerTest { public void testIntervals() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - "-T RealignerTargetCreator -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", + "-T RealignerTargetCreator -R " + b36KGReference + " -I " + validationDataLocation + 
"NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam --mismatchFraction 0.15 -L 1:10,000,000-10,050,000 -o %s", 1, Arrays.asList("e7accfa58415d6da80383953b1a3a986")); executeTest("test standard", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( - "-T RealignerTargetCreator -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", + "-T RealignerTargetCreator -B:dbsnp,vcf " + GATKDataLocation + "dbsnp_129_b36.vcf -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", 1, - Arrays.asList("f23ba17ee0f9573dd307708175d90cd2")); + Arrays.asList("0367d39a122c8ac0899fb868a82ef728")); executeTest("test dbsnp", spec2); WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorPerformanceTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorPerformanceTest.java index 0b6694fd9..9490206c8 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorPerformanceTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorPerformanceTest.java @@ -12,7 +12,7 @@ public class RealignerTargetCreatorPerformanceTest extends WalkerTest { WalkerTestSpec spec1 = new WalkerTestSpec( "-R " + hg18Reference + " -T RealignerTargetCreator" + - " -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" + + " -B:dbsnp,vcf " + GATKDataLocation + "dbsnp_132.hg18.vcf" + " -I " + evaluationDataLocation + "NA12878.GAII.chr1.50MB.bam" + " -L chr1:1-50,000,000" + " -o /dev/null", @@ -23,7 +23,7 @@ public class RealignerTargetCreatorPerformanceTest extends WalkerTest { WalkerTestSpec spec2 = new WalkerTestSpec( "-R " + hg18Reference + " -T RealignerTargetCreator" + - " -D 
/humgen/gsa-hpprojects/GATK/data/dbsnp_129_hg18.rod" + + " -B:dbsnp,vcf " + GATKDataLocation + "dbsnp_132.hg18.vcf" + " -I " + evaluationDataLocation + "NA12878.ESP.WEx.chr1.bam" + " -L " + evaluationDataLocation + "whole_exome_agilent_designed_120.targets.chr1.interval_list" + " -o /dev/null", diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java index 9f59adeb6..f62f12082 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java @@ -10,24 +10,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { private static String fundamentalTestVCF = phaseByTransmissionTestDataRoot + "/" + "FundamentalsTest.unfiltered.vcf"; @Test - public void testBasicFunctionalityWithoutFilters() { - WalkerTestSpec spec = new WalkerTestSpec( - buildCommandLine( - "-T PhaseByTransmission", - "-R " + b37KGReference, - "-B:variant,VCF " + fundamentalTestVCF, - "-f NA12892+NA12891=NA12878", - "-nofilters", - "-o %s" - ), - 1, - Arrays.asList("416a483e87358cdcb0b09a496e3254c0") - ); - executeTest("testBasicFunctionalityWithoutFilters", spec); - } - - @Test - public void testBasicFunctionalityWithFilters() { + public void testBasicFunctionality() { WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( "-T PhaseByTransmission", @@ -37,8 +20,8 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("8c5db343567e90e97993912c7e541d0d") + Arrays.asList("45fef0e23113e2fcd9570379e2fc1b75") ); - executeTest("testBasicFunctionalityWithFilters", spec); + executeTest("testBasicFunctionality", spec); } } diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java index 129161da3..e81d2670c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java @@ -18,10 +18,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariates1() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "7b5832d4b2a23b8ef2bb639eb59bfa88" ); - e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "9c006f8e9fb5752b1c139f5a8cc7ea88"); - e.put( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "e6f7b4ab9aa291022e0ba8b7dbe4c77e" ); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "e6b98af01c5a08e4954b79ec42db6fc3" ); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "5a52b00d9794d27af723bcf93366681e" ); + e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "17d4b8001c982a70185e344929cf3941"); + e.put( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "714e65d6cb51ae32221a77ce84cbbcdc" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "64e9f17a1cf6fc04c1f2717c2d2eca67" ); for ( String parallelism : Arrays.asList("", " -nt 4")) { for ( Map.Entry entry : e.entrySet() ) { @@ -30,7 +30,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + - " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + + " -B:dbsnp,VCF " + GATKDataLocation + "dbsnp_129_b36.vcf" + " -T CountCovariates" + " -I " 
+ bam + ( bam.equals( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam" ) @@ -52,10 +52,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibrator1() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "0278cce4cfdab869dc0c11d6852a984b" ); - e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "6797d7ffa4ef6c48413719ba32696ccf"); - e.put( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "2bb3374dde131791d7638031ae3b3e10" ); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "1f9d8944b73169b367cb83b0d22e5432" ); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "2864f231fab7030377f3c8826796e48f" ); + e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "c164dd635721ba6df3f06dac1877c32d"); + e.put( validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "74314e5562c1a65547bb0edaacffe602" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "2a37c6001826bfabf87063b1dfcf594f" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -83,7 +83,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesUseOriginalQuals() { HashMap e = new HashMap(); - e.put( validationDataLocation + "originalQuals.1kg.chr1.1-1K.bam", "3404965ec4fa99873fe6a44521944fd5"); + e.put( validationDataLocation + "originalQuals.1kg.chr1.1-1K.bam", "278846c55d97bd9812b758468a83f559"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -97,7 +97,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { " -standard" + " -OQ" + " -recalFile %s" + - " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod", + " -B:dbsnp,VCF " + GATKDataLocation + "dbsnp_129_b36.vcf", 1, // just one output file 
Arrays.asList(md5)); executeTest("testCountCovariatesUseOriginalQuals", spec); @@ -107,7 +107,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorMaxQ70() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "0278cce4cfdab869dc0c11d6852a984b" ); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "2864f231fab7030377f3c8826796e48f" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -136,7 +136,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesSolidIndelsRemoveRefBias() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "c9ea5f995e1e2b7a5688533e678dcedc" ); + e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "8379f24cf5312587a1f92c162ecc220f" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -144,7 +144,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + - " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + + " -B:dbsnp,VCF " + GATKDataLocation + "dbsnp_129_b36.vcf" + " -T CountCovariates" + " -I " + bam + " -standard" + @@ -162,7 +162,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorSolidIndelsRemoveRefBias() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "993fae4270e7e1e15986f270acf247af" ); + e.put( validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "7d5edb75b176e4151de225f699719ee4" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -238,7 +238,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesVCFPlusDBsnp() { HashMap e = new HashMap(); 
- e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "a3d892bd60d8f679affda3c1e3af96c1"); + e.put( validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam", "9131d96f39badbf9753653f55b148012"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -249,7 +249,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { " -B:anyNameABCD,VCF3 " + validationDataLocation + "vcfexample3.vcf" + " -T CountCovariates" + " -I " + bam + - " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + + " -B:dbsnp,VCF " + GATKDataLocation + "dbsnp_129_b36.vcf" + " -L 1:10,000,000-10,200,000" + " -cov ReadGroupCovariate" + " -cov QualityScoreCovariate" + @@ -263,10 +263,11 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { } } + @Test public void testCountCovariatesNoIndex() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "284ccac1f8fe485e52c86333cac7c2d4" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "8993d32df5cb66c7149f59eccbd57f4c" ); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -274,7 +275,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-R " + b36KGReference + - " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" + + " -B:dbsnp,VCF " + GATKDataLocation + "dbsnp_129_b36.vcf" + " -T CountCovariates" + " -I " + bam + " -cov ReadGroupCovariate" + @@ -292,7 +293,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testTableRecalibratorNoIndex() { HashMap e = new HashMap(); - e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "c167799c2d9cab815d7c9b23337f162e" ); + e.put( validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.noindex.bam", "5f913c98ca99754902e9d34f99df468f" 
); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -315,7 +316,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { } } } - + @Test public void testCountCovariatesFailWithoutDBSNP() { HashMap e = new HashMap(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java index ade34c964..43ea401f7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersPerformanceTest.java @@ -16,7 +16,7 @@ public class RecalibrationWalkersPerformanceTest extends WalkerTest { " -L chr1:1-50,000,000" + " -standard" + " -OQ" + - " --DBSNP " + GATKDataLocation + "dbsnp_129_hg18.rod" + + " -B:dbsnp,VCF " + GATKDataLocation + "dbsnp_132.hg18.vcf" + " -recalFile /dev/null" + moreArgs, 0, new ArrayList(0)); @@ -31,7 +31,7 @@ public class RecalibrationWalkersPerformanceTest extends WalkerTest { " -L " + evaluationDataLocation + "whole_exome_agilent_designed_120.targets.chr1.interval_list" + " -standard" + " -OQ" + - " --DBSNP " + GATKDataLocation + "dbsnp_129_hg18.rod" + + " -B:dbsnp,VCF " + GATKDataLocation + "dbsnp_132.hg18.vcf" + " -recalFile /dev/null" + moreArgs, 0, new ArrayList(0)); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbesIntegrationTest.java deleted file mode 100755 index 850a3113e..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbesIntegrationTest.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.sequenom; - -import org.broadinstitute.sting.WalkerTest; -import 
org.testng.annotations.Test; - -import java.util.Arrays; - -public class PickSequenomProbesIntegrationTest extends WalkerTest { - @Test - public void testProbes() { - String testVCF = validationDataLocation + "complexExample.vcf4"; - String testArgs = "-R " + b36KGReference + " -T PickSequenomProbes -L 1:10,000,000-11,000,000 -B:input,VCF "+testVCF+" -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, - Arrays.asList("6b5409cc78960f1be855536ed89ea9dd")); - executeTest("Test probes", spec); - } - - @Test - public void testProbesUsingDbSNPMask() { - - String md5 = "46d53491af1d3aa0ee1f1e13d68b732d"; - String testVCF = validationDataLocation + "pickSeqIntegrationTest.vcf"; - - String testArgs = "-snp_mask " + validationDataLocation + "pickSeqIntegrationTest.bed -R " - + b36KGReference + " -omitWindow -nameConvention " - + "-project_id 1kgp3_s4_lf -T PickSequenomProbes -B:input,VCF "+testVCF+" -o %s"; - WalkerTestSpec spec1 = new WalkerTestSpec(testArgs, 1, Arrays.asList(md5)); - executeTest("Test probes", spec1); - - testArgs += " -nmw 1"; - WalkerTestSpec spec2 = new WalkerTestSpec(testArgs, 1, Arrays.asList(md5)); - executeTest("Test probes", spec2); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java new file mode 100755 index 000000000..95f4ac0ae --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java @@ -0,0 +1,56 @@ +package org.broadinstitute.sting.gatk.walkers.validation; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: Ghost + * Date: 7/19/11 + * Time: 7:39 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class ValidationAmpliconsIntegrationTest extends WalkerTest { + + @Test(enabled=true) + public void testWikiExample() { + String siteVCF = validationDataLocation + "sites_to_validate.vcf"; + String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf"; + String intervalTable = validationDataLocation + "amplicon_interval_table1.table"; + String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons -B:ValidateAlleles,VCF "+siteVCF+" -o %s"; + testArgs += " -B:ProbeIntervals,table "+intervalTable+" -BTI ProbeIntervals -B:MaskAlleles,VCF "+maskVCF; + testArgs += " --virtualPrimerSize 30"; + WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, + Arrays.asList("27f9450afa132888a8994167f0035fd7")); + executeTest("Test probes", spec); + } + + @Test(enabled=true) + public void testWikiExampleNoBWA() { + String siteVCF = validationDataLocation + "sites_to_validate.vcf"; + String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf"; + String intervalTable = validationDataLocation + "amplicon_interval_table1.table"; + String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons -B:ValidateAlleles,VCF "+siteVCF+" -o %s"; + testArgs += " -B:ProbeIntervals,table "+intervalTable+" -BTI ProbeIntervals -B:MaskAlleles,VCF "+maskVCF; + testArgs += " --virtualPrimerSize 30 --doNotUseBWA"; + WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, + Arrays.asList("f2611ff1d9cd5bedaad003251fed8bc1")); + executeTest("Test probes", spec); + } + + @Test(enabled=true) + public void testWikiExampleMonoFilter() { + String siteVCF = validationDataLocation + "sites_to_validate.vcf"; + String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf"; + String intervalTable = validationDataLocation + "amplicon_interval_table1.table"; + String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons -B:ValidateAlleles,VCF "+siteVCF+" -o %s"; + testArgs += " -B:ProbeIntervals,table "+intervalTable+" -BTI ProbeIntervals -B:MaskAlleles,VCF "+maskVCF; + 
testArgs += " --virtualPrimerSize 30 --filterMonomorphic"; + WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, + Arrays.asList("77b3f30e38fedad812125bdf6cf3255f")); + executeTest("Test probes", spec); + } + +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 23c606ad0..3eeabdc5b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -4,8 +4,6 @@ import org.broadinstitute.sting.WalkerTest; import org.testng.annotations.Test; import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; public class VariantEvalIntegrationTest extends WalkerTest { private static String variantEvalTestDataRoot = validationDataLocation + "/VariantEval"; @@ -45,7 +43,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("48b8417c1f8bd74ff7b9808580abd2a2") + Arrays.asList("bced1842c78fbabb089dd12b7087050d") ); executeTest("testFundamentalsCountVariantsSNPsandIndels", spec); } @@ -66,7 +64,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("86d45ecefdf5849c55b3ca8f82a3d525") + Arrays.asList("06510bd37ffaa39e817ca0dcaf8f8ac2") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNovelty", spec); } @@ -88,7 +86,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("3d18901ec1766aa2e748eac913f5ddcd") + Arrays.asList("19c5b1b6396921c5b1059a2849ae4fcc") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNoveltyAndFilter", spec); } @@ -109,7 +107,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("677fe398643e62a10d6739d36a720a12") + 
Arrays.asList("a71f8d81cf166cd97ac628092650964a") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithCpG", spec); } @@ -130,7 +128,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("5fb44fd7cb00941c986a9941e43e44cd") + Arrays.asList("4dabe0658232f6174188515db6dfe112") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithFunctionalClass", spec); } @@ -151,7 +149,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("daaca7ef3b7313e5af217cbc6f37c9e2") + Arrays.asList("3340587f10ceff83e5567ddfd1a9a60e") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithDegeneracy", spec); } @@ -172,7 +170,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("97c466f8ffd0fcf2c30ef08669d213d9") + Arrays.asList("c730c7ee31c8138cef6efd8dd04fbbfc") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithSample", spec); } @@ -195,7 +193,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("df8cdfcf3d0c2fc795812c6eae6a76f8") + Arrays.asList("2559ca8f454b03e81561f6947f79df18") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithJexlExpression", spec); } @@ -220,7 +218,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("c7aed12265e2b2311d17a0cc8a29f6aa") + Arrays.asList("23aa5f97641d2fd033095f21c51d2f37") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithMultipleJexlExpressions", spec); } @@ -239,7 +237,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("d44c8f44384189a09eea85a8e89d7299") + Arrays.asList("a69dd3f06903b3f374c6d6f010c653e0") ); executeTest("testFundamentalsCountVariantsNoCompRod", spec); } @@ -249,7 +247,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { String extraArgs = "-L 1:1-10,000,000"; for (String tests : testsEnumerations) { 
WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s", - 1, Arrays.asList("cdbe47ea01b9dd79ff1c5ce6f5fa8bec")); + 1, Arrays.asList("db95c8af8ba549d38ca6741a59fd6892")); executeTestParallel("testSelect1", spec); } } @@ -260,14 +258,14 @@ public class VariantEvalIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec(cmdRoot + " -ST CpG -B:eval,VCF3 " + validationDataLocation + vcfFile + " -B:comp,VCF3 " + validationDataLocation + "GenotypeConcordanceComp.vcf -noEV -EV GenotypeConcordance -o %s", 1, - Arrays.asList("e4c981f7f5d78680c71310fc9be9a1c1")); + Arrays.asList("96f27163f16bb945f19c6623cd6db34e")); executeTestParallel("testVEGenotypeConcordance" + vcfFile, spec); } @Test public void testCompVsEvalAC() { String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance -B:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf -B:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("162daa5039e1965eb2423a8589339a69")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("d1932be3748fcf6da77dc51aec323710")); executeTestParallel("testCompVsEvalAC",spec); } @@ -278,14 +276,14 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testTranches() { String extraArgs = "-T VariantEval -R "+ hg18Reference +" -B:eval,vcf " + validationDataLocation + "GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.optimized.vcf -o %s -EV TiTvVariantEvaluator -L chr1 -noEV -ST CpG -tf " + testDir + "tranches.6.txt"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("90cd98044e754b80034a9f4e6d2c55b9")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("984df6e94a546294fc7e0846cbac2dfe")); executeTestParallel("testTranches",spec); } @Test public void testCompOverlap() { String 
extraArgs = "-T VariantEval -R " + b37KGReference + " -L " + validationDataLocation + "VariantEval/pacbio.hg19.intervals -B:comphapmap,vcf " + comparisonDataLocation + "Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf -B:eval,vcf " + validationDataLocation + "VariantEval/pacbio.ts.recalibrated.vcf -noEV -EV CompOverlap -sn NA12878 -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("70aa420929de7f888a6f48c2d01bbcda")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("462d4784dd55294ef9d5118217b157a5")); executeTestParallel("testCompOverlap",spec); } @@ -299,7 +297,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " -D " + dbsnp + " -B:evalBI,VCF " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("5b1fc9a4066aca61f1b5f7b933ad37d9")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("61c36fb6cc75172e2b22a44edeae85e0")); executeTestParallel("testEvalTrackWithoutGenotypes",spec); } @@ -313,7 +311,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " -B:evalBI,VCF " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " -B:evalBC,VCF " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("6d902d9d4d8fef5219a43e416a51cee6")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("79089484097614b7ab81bbc3ad3a892a")); executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec); } @@ -330,13 +328,13 @@ public class VariantEvalIntegrationTest extends WalkerTest { " -noST -noEV -ST Novelty -EV CompOverlap" + " -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("55a1c53bced20701c56accfc3eb782a7")); + WalkerTestSpec spec = new 
WalkerTestSpec(extraArgs,1,Arrays.asList("9f906c04a4553d649b51ae67e0a25113")); executeTestParallel("testMultipleCompTracks",spec); } @Test public void testPerSampleAndSubsettedSampleHaveSameResults() { - String md5 = "454a1750fd36525f24172b21af5f49de"; + String md5 = "97a16a99a43d2384cfabc39d36647419"; WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( @@ -391,7 +389,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("bf324e4c87fe0d21170fcd2a67a20371") + Arrays.asList("44464fe7c89a56cf128a932ef640f7da") ); executeTest("testAlleleCountStrat", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 00ee44f75..9b152bc71 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -80,9 +80,9 @@ public class CombineVariantsIntegrationTest extends WalkerTest { @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "1d5a021387a8a86554db45a29f66140f", false); } // official project VCF files in tabix format @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "20163d60f18a46496f6da744ab5cc0f9", false); } // official project VCF files in tabix format - @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "5b82f37df1f5ba40f0474d71c94142ec", false); } + @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", 
"f1cf095c2fe9641b7ca1f8ee2c46fd4a", false); } - @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "c58dca482bf97069eac6d9f1a07a2cba", false); } + @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e144b6283765494bfe8189ac59965083", false); } @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "89f55abea8f59e39d1effb908440548c", true); } @@ -100,7 +100,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest { " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + " -genotypeMergeOptions UNIQUIFY -L 1"), 1, - Arrays.asList("8b78339ccf7a5a5a837f79e88a3a38e5")); + Arrays.asList("1de95f91ca15d2a8856de35dee0ce33e")); executeTest("threeWayWithRefs", spec); } @@ -120,6 +120,6 @@ public class CombineVariantsIntegrationTest extends WalkerTest { @Test public void complexTestFull() { combineComplexSites("", "b5a53ee92bdaacd2bb3327e9004ae058"); } @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "df96cb3beb2dbb5e02f80abec7d3571e"); } - @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "f72a178137e25dbe0b931934cdc0079d"); } + @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "f704caeaaaed6711943014b847fe381a"); } @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "f704caeaaaed6711943014b847fe381a"); } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariantsIntegrationTest.java new file mode 100644 index 000000000..da6277242 --- /dev/null +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariantsIntegrationTest.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Tests LeftAlignVariants + */ +public class LeftAlignVariantsIntegrationTest extends WalkerTest { + + @Test + public void testLeftAlignment() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T LeftAlignVariants -o %s -R " + b37KGReference + " -B:variant,vcf " + validationDataLocation + "forLeftAlignVariantsTest.vcf -NO_HEADER", + 1, + Arrays.asList("158b1d71b28c52e2789f164500b53732")); + executeTest("test left alignment", spec); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java index d7efe4212..d396e5167 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -98,7 +98,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " -EV CompOverlap -noEV -noST" + " -o %s", 1, - Arrays.asList("f60729c900bc8368717653b3fad80d1e") //"f60729c900bc8368717653b3fad80d1e" + Arrays.asList("ea09bf764adba9765b99921c5ba2c709") ); executeTest("testVCFStreamingChain", selectTestSpec); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java old mode 100644 new mode 100755 index 72647c8e1..1db712353 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java @@ -44,7 +44,7 @@ public class VariantsToTableIntegrationTest extends 
WalkerTest { @Test(enabled = true) public void testComplexVariantsToTable() { WalkerTestSpec spec = new WalkerTestSpec(variantsToTableCmd(" -AMD"), - Arrays.asList("b2a3712c1bfad8f1383ffada8b5017ba")); + Arrays.asList("e8f771995127b727fb433da91dd4ee98")); executeTest("testComplexVariantsToTable", spec).getFirst(); } diff --git a/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java index aa6303a6f..77db34cbc 100644 --- a/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java @@ -34,7 +34,6 @@ import org.testng.annotations.Test; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.jna.lsf.v7_0_6.LibBat.*; -import javax.jws.soap.SOAPBinding; import java.io.File; /** @@ -55,25 +54,25 @@ public class LibBatIntegrationTest extends BaseTest { @Test public void testReadConfEnv() { - LibLsf.config_param[] unitsParam = (LibLsf.config_param[]) new LibLsf.config_param().toArray(4); + LibLsf.config_param[] configParams = (LibLsf.config_param[]) new LibLsf.config_param().toArray(4); - unitsParam[0].paramName = "LSF_UNIT_FOR_LIMITS"; - unitsParam[1].paramName = "LSF_CONFDIR"; - unitsParam[2].paramName = "MADE_UP_PARAMETER"; + configParams[0].paramName = "LSF_UNIT_FOR_LIMITS"; + configParams[1].paramName = "LSF_CONFDIR"; + configParams[2].paramName = "MADE_UP_PARAMETER"; - Structure.autoWrite(unitsParam); + Structure.autoWrite(configParams); - if (LibLsf.ls_readconfenv(unitsParam[0], null) != 0) { + if (LibLsf.ls_readconfenv(configParams[0], null) != 0) { Assert.fail(LibLsf.ls_sysmsg()); } - Structure.autoRead(unitsParam); + Structure.autoRead(configParams); - System.out.println("LSF_UNIT_FOR_LIMITS: " + unitsParam[0].paramValue); - Assert.assertNotNull(unitsParam[1].paramValue); - Assert.assertNull(unitsParam[2].paramValue); - 
Assert.assertNull(unitsParam[3].paramName); - Assert.assertNull(unitsParam[3].paramValue); + System.out.println("LSF_UNIT_FOR_LIMITS: " + configParams[0].paramValue); + Assert.assertNotNull(configParams[1].paramValue); + Assert.assertNull(configParams[2].paramValue); + Assert.assertNull(configParams[3].paramName); + Assert.assertNull(configParams[3].paramValue); } @Test diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java index 68a2ecf8d..d08cda949 100755 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java @@ -70,7 +70,7 @@ public class IndexFactoryUnitTest { CloseableTribbleIterator it = source.iterator(); while (it.hasNext() && (counter++ < maxRecords || maxRecords == -1) ) { VariantContext vc = it.next(); - writer.add(vc, vc.getReferenceBaseForIndel()); + writer.add(vc); } writer.close(); diff --git a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java index 34a2e616a..e3a926fb9 100644 --- a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java @@ -57,8 +57,8 @@ public class VCFWriterUnitTest extends BaseTest { VCFHeader header = createFakeHeader(metaData,additionalColumns); VCFWriter writer = new StandardVCFWriter(fakeVCFFile); writer.writeHeader(header); - writer.add(createVC(header),"A".getBytes()[0]); - writer.add(createVC(header),"A".getBytes()[0]); + writer.add(createVC(header)); + writer.add(createVC(header)); writer.close(); VCFCodec reader = new VCFCodec(); AsciiLineReader lineReader; @@ -135,7 +135,7 @@ public class VCFWriterUnitTest extends BaseTest { 
genotypes.put(name,gt); } - return new VariantContext("RANDOM",loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, 0, filters, attributes); + return new VariantContext("RANDOM",loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, 0, filters, attributes, (byte)'A'); } diff --git a/public/java/test/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java new file mode 100644 index 000000000..45a618f71 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/text/TextFormattingUtilsUnitTest.java @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.text; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Collections; + +public class TextFormattingUtilsUnitTest extends BaseTest { + @Test(expectedExceptions = ReviewedStingException.class) + public void testSplitWhiteSpaceNullLine() { + TextFormattingUtils.splitWhiteSpace(null); + } + + @Test + public void testSplitWhiteSpace() { + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace("foo bar baz"), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace("foo bar baz"), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace(" foo bar baz"), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace(" foo bar baz "), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace("foo bar baz "), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitWhiteSpace("\tfoo\tbar\tbaz\t"), new String[]{"foo", "bar", "baz"}); + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testGetWordStartsNullLine() { + TextFormattingUtils.getWordStarts(null); + } + + @Test + public void testGetWordStarts() { + Assert.assertEquals(TextFormattingUtils.getWordStarts("foo bar baz"), Arrays.asList(4, 8)); + Assert.assertEquals(TextFormattingUtils.getWordStarts("foo bar baz"), Arrays.asList(5, 10)); + Assert.assertEquals(TextFormattingUtils.getWordStarts(" foo bar baz"), Arrays.asList(1, 5, 9)); + Assert.assertEquals(TextFormattingUtils.getWordStarts(" foo bar baz "), Arrays.asList(1, 5, 9)); + Assert.assertEquals(TextFormattingUtils.getWordStarts("foo bar baz "), Arrays.asList(4, 8)); + 
Assert.assertEquals(TextFormattingUtils.getWordStarts("\tfoo\tbar\tbaz\t"), Arrays.asList(1, 5, 9)); + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testSplitFixedWidthNullLine() { + TextFormattingUtils.splitFixedWidth(null, Collections.emptyList()); + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testSplitFixedWidthNullColumnStarts() { + TextFormattingUtils.splitFixedWidth("foo bar baz", null); + } + + @Test + public void testSplitFixedWidth() { + Assert.assertEquals(TextFormattingUtils.splitFixedWidth("foo bar baz", Arrays.asList(4, 8)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth("foo bar baz", Arrays.asList(5, 10)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth(" foo bar baz", Arrays.asList(5, 9)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth(" foo bar baz ", Arrays.asList(5, 9)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth("foo bar baz ", Arrays.asList(4, 8)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth("\tfoo\tbar\tbaz\t", Arrays.asList(5, 9)), new String[] { "foo", "bar", "baz" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth("f o b r b z", Arrays.asList(4, 8)), new String[] { "f o", "b r", "b z" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth(" f o b r b z", Arrays.asList(4, 8)), new String[] { "f o", "b r", "b z" }); + Assert.assertEquals(TextFormattingUtils.splitFixedWidth(" f o b r b z", Arrays.asList(4, 8)), new String[] { "f", "o b", "r b z" }); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java deleted file mode 100644 index 
78ab916db..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java +++ /dev/null @@ -1,402 +0,0 @@ -// our package -package org.broadinstitute.sting.utils.threading; - - -// the imports for unit testing. - - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.iterators.GenomeLocusIterator; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.Assert; -import org.testng.annotations.*; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; -import java.util.concurrent.*; - -/** - * Basic unit test for GenomeLoc - */ -public class GenomeLocProcessingTrackerUnitTest extends BaseTest { - IndexedFastaSequenceFile fasta = null; - GenomeLocParser genomeLocParser = null; - String chr1 = null; - private final static String FILE_ROOT = "public/testdata/GLPTFile"; - - @BeforeTest - public void before() { - File referenceFile = new File(hg18Reference); - try { - fasta = new IndexedFastaSequenceFile(referenceFile); - chr1 = fasta.getSequenceDictionary().getSequence(1).getSequenceName(); - genomeLocParser = new GenomeLocParser(fasta); - - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(referenceFile,ex); - } - } - - @BeforeMethod - public void beforeMethod(Object[] data) { - if ( data.length > 0 ) - ((TestTarget)data[0]).init(); - } - - @AfterMethod - public void afterMethod(Object[] data) { - if ( data.length > 0 ) { - ((TestTarget)data[0]).getTracker().close(); - ((TestTarget)data[0]).cleanup(); - } - } - - abstract private class TestTarget { - String name; - int nShards; - int shardSize; - File file; - - public void init() { cleanup(); } - - public 
void cleanup() { - if ( file != null && file.exists() ) - file.delete(); - } - - public boolean isThreadSafe() { return true; } - - protected TestTarget(String name, int nShards, int shardSize, File file) { - this.name = name; - this.nShards = nShards; - this.shardSize = shardSize; - this.file = file; - } - - public abstract GenomeLocProcessingTracker getTracker(); - - public List getShards() { - List shards = new ArrayList(); - for ( int i = 0; i < nShards; i++ ) { - int start = shardSize * i; - int stop = start + shardSize; - shards.add(genomeLocParser.createGenomeLoc(chr1, start, stop)); - } - return shards; - } - - public String toString() { - return String.format("TestTarget %s: nShards=%d shardSize=%d", name, nShards, shardSize); - } - } - - @DataProvider(name = "threadData") - public Object[][] createThreadData() { - // gotta keep the tests small... - return createData(Arrays.asList(10, 100), Arrays.asList(10)); - //return createData(Arrays.asList(10, 100, 1000, 10000), Arrays.asList(10)); - } - - public Object[][] createData(List nShards, List shardSizes) { - List params = new ArrayList(); - - int counter = 0; - String name = null; - for ( int nShard : nShards ) { - for ( int shardSize : shardSizes ) { - // shared mem -- canonical implementation - params.add(new TestTarget("ThreadSafeSharedMemory", nShard, shardSize, null) { - GenomeLocProcessingTracker tracker = new SharedMemoryGenomeLocProcessingTracker(new ClosableReentrantLock()); - public GenomeLocProcessingTracker getTracker() { return tracker; } - }); - - final File file1 = new File(String.format("%s_ThreadSafeFileBacked_%d_%d", FILE_ROOT, counter++, nShard, shardSize)); - params.add(new TestTarget("ThreadSafeFileBacked", nShard, shardSize, file1) { - GenomeLocProcessingTracker tracker = new FileBackedGenomeLocProcessingTracker(file1, genomeLocParser, new ClosableReentrantLock(), null); - public GenomeLocProcessingTracker getTracker() { return tracker; } - }); - - name = 
"FileBackedSharedFileThreadSafe"; - final File file2 = new File(String.format("%s_%s_%d_%d", FILE_ROOT, name, counter++, nShard, shardSize)); - params.add(new TestTarget(name, nShard, shardSize, file2) { - GenomeLocProcessingTracker tracker = new FileBackedGenomeLocProcessingTracker(file2, genomeLocParser, new SharedFileThreadSafeLock(file2, -1), null); - public GenomeLocProcessingTracker getTracker() { return tracker; } - }); - - name = "FileBackedSharedFile"; - final File file3 = new File(String.format("%s_%s_%d_%d", FILE_ROOT, name, counter++, nShard, shardSize)); - params.add(new TestTarget(name, nShard, shardSize, file3) { - GenomeLocProcessingTracker tracker = new FileBackedGenomeLocProcessingTracker(file3, genomeLocParser, new SharedFileLock(file3, -1), null); - public GenomeLocProcessingTracker getTracker() { return tracker; } - public boolean isThreadSafe() { return false; } - }); - } - } - - List params2 = new ArrayList(); - for ( TestTarget x : params ) params2.add(new Object[]{x}); - return params2.toArray(new Object[][]{}); - } - - @DataProvider(name = "simpleData") - public Object[][] createSimpleData() { - return createData(Arrays.asList(1000), Arrays.asList(100)); - } - - private static final String NAME_ONE = "name1"; - private static final String NAME_TWO = "name2"; - - @Test(enabled = true) - public void testNoop() { - GenomeLocProcessingTracker tracker = new NoOpGenomeLocProcessingTracker(); - for ( int start = 1; start < 100; start++ ) { - for ( int n = 0; n < 2; n++ ) { - GenomeLoc loc = genomeLocParser.createGenomeLoc(chr1, start, start +1); - ProcessingLoc ploc = tracker.claimOwnership(loc, NAME_ONE); - Assert.assertTrue(ploc.isOwnedBy(NAME_ONE)); - Assert.assertEquals(tracker.updateAndGetProcessingLocs(NAME_ONE).size(), 0); - } - } - } - - @Test(dataProvider = "simpleData", enabled = true) - public void testSingleProcessTracker(TestTarget test) { - GenomeLocProcessingTracker tracker = test.getTracker(); - List shards = test.getShards(); - 
logger.warn("testSingleProcessTracker " + test); - - int counter = 0; - for ( GenomeLoc shard : shards ) { - counter++; - - Assert.assertNull(tracker.findOwner(shard, NAME_ONE)); - Assert.assertFalse(tracker.locIsOwned(shard, NAME_ONE)); - - ProcessingLoc proc = tracker.claimOwnership(shard,NAME_ONE); - Assert.assertNotNull(proc); - Assert.assertNotNull(proc.getLocation()); - Assert.assertNotNull(proc.getOwner()); - Assert.assertEquals(proc.getLocation(), shard); - Assert.assertEquals(proc.getOwner(), NAME_ONE); - Assert.assertEquals(tracker.findOwner(shard, NAME_ONE), proc); - Assert.assertTrue(tracker.locIsOwned(shard, NAME_ONE)); - Assert.assertNotNull(tracker.updateAndGetProcessingLocs(NAME_ONE)); - Assert.assertEquals(tracker.updateAndGetProcessingLocs(NAME_ONE).size(), counter); - - ProcessingLoc badClaimAttempt = tracker.claimOwnership(shard,NAME_TWO); - Assert.assertFalse(badClaimAttempt.getOwner().equals(NAME_TWO)); - Assert.assertEquals(badClaimAttempt.getOwner(), NAME_ONE); - } - } - - @Test(dataProvider = "simpleData", enabled = true) - public void testIterator(TestTarget test) { - GenomeLocProcessingTracker tracker = test.getTracker(); - List shards = test.getShards(); - logger.warn("testIterator " + test); - - List markedShards = new ArrayList(); - List toFind = new ArrayList(); - - for ( int i = 0; i < shards.size(); i++ ) { - if ( ! (i % 10 == 0) ) { - markedShards.add(shards.get(i)); - tracker.claimOwnership(shards.get(i), NAME_TWO); - } else { - toFind.add(shards.get(i)); - } - } - - int nFound = 0; - Iterator it = shards.iterator(); - while ( it.hasNext() ) { - GenomeLoc shard = tracker.claimOwnershipOfNextAvailable(it, NAME_ONE); - - if ( shard == null ) { // everything to get is done - Assert.assertEquals(nFound, toFind.size(), "Didn't find all of the available shards"); - } else { - nFound++; - ProcessingLoc proc = tracker.findOwner(shard, NAME_ONE); - - Assert.assertTrue(proc.isOwnedBy(NAME_ONE)); - Assert.assertTrue(! 
markedShards.contains(shard), "Ran process was already marked!"); - Assert.assertTrue(toFind.contains(shard), "Claimed shard wasn't one of the unmarked!"); - } - } - } - - @Test(dataProvider = "simpleData", enabled = true) - public void testMarkedProcesses(TestTarget test) { - GenomeLocProcessingTracker tracker = test.getTracker(); - List shards = test.getShards(); - logger.warn("testMarkedProcesses " + test); - - List markedShards = new ArrayList(); - - for ( int i = 0; i < shards.size(); i++ ) { - if ( i % 2 == 0 ) { - markedShards.add(shards.get(i)); - tracker.claimOwnership(shards.get(i), NAME_TWO); - } - } - - for ( GenomeLoc shard : shards ) { - ProcessingLoc proc = tracker.claimOwnership(shard,NAME_ONE); - - Assert.assertTrue(proc.isOwnedBy(NAME_ONE) || proc.isOwnedBy(NAME_TWO)); - - if ( proc.isOwnedBy(NAME_ONE) ) - Assert.assertTrue(! markedShards.contains(shard), "Ran process was already marked!"); - else - Assert.assertTrue(markedShards.contains(shard), "Unran process wasn't marked"); - - if ( ! 
markedShards.contains(shard) ) { - Assert.assertEquals(tracker.findOwner(shard, NAME_ONE), proc); - } - } - } - - public class TestThread implements Callable { - public TestTarget test; - public String name; - public List ran, toRun; - boolean useIterator; - - public TestThread(TestTarget test, int count, List toRun, boolean useIterator) { - this.test = test; - this.toRun = toRun; - this.name = "thread" + count; - this.ran = new ArrayList(); - this.useIterator = useIterator; - } - - public Integer call() { - //logger.warn(String.format("Call() Thread %s", name)); - if ( useIterator ) { - for ( GenomeLoc shard : test.getTracker().onlyOwned(toRun.iterator(), name) ) { - if ( shard != null ) { // ignore the unclaimable end of the stream - ran.add(shard); - // do some work here - for ( int sum =0, i = 0; i < 100000; i++) sum += i; - } - } - - } else { - for ( GenomeLoc shard : toRun ) { - //System.out.printf("Claiming ownership in %s on %s%n", name, shard); - ProcessingLoc proc = test.getTracker().claimOwnership(shard,name); - //System.out.printf(" => ownership of %s is %s (I own? 
%b)%n", shard, proc.getOwner(), proc.isOwnedBy(name)); - if ( proc.isOwnedBy(name) ) { - ran.add(proc.getLocation()); - // do some work here - for ( int sum =0, i = 0; i < 100000; i++) sum += i; - } - //logger.warn(String.format("Thread %s on %s -> owned by %s", name, shard, proc.getOwner())); - } - } - - return 1; - } - } - - private static TestThread findOwner(String name, List threads) { - for ( TestThread thread : threads ) { - if ( thread.name.equals(name) ) - return thread; - } - return null; - } - - private static final void assertAllThreadsFinished(List> futures) { - try { - for ( Future f : futures ) { - Assert.assertTrue(f.isDone(), "Thread never finished running"); - Assert.assertTrue(f.get() != null, "Finished successfully"); - } - } catch (InterruptedException e) { - Assert.fail("Thread failed to run to completion", e); - } catch (ExecutionException e) { - Assert.fail("Thread generated an exception", e); - } - } - - private static final List subList(List l, int i) { - List r = new ArrayList(); - for ( int j = 0; j < l.size(); j++ ) { - if ( j % i == 0 ) - r.add(l.get(j)); - } - - return r; - } - - @Test(dataProvider = "threadData", enabled = true) - public void testThreadedProcessesLowLevelFunctions(TestTarget test) { - testThreading(test, false); - } - - @Test(dataProvider = "threadData", enabled = true) - public void testThreadedProcessesIterator(TestTarget test) { - testThreading(test, true); - } - - private void testThreading(TestTarget test, boolean useIterator) { - if ( ! 
test.isThreadSafe() ) - // skip tests that aren't thread safe - return; - - // start up 3 threads - logger.warn("ThreadedTesting " + test + " using iterator " + useIterator); - List threads = new ArrayList(); - for ( int i = 0; i < 4; i++) { - List toRun = subList(test.getShards(), i+1); - TestThread thread = new TestThread(test, i, toRun, useIterator); - threads.add(thread); - } - ExecutorService exec = java.util.concurrent.Executors.newFixedThreadPool(threads.size()); - - try { - List> results = exec.invokeAll(threads, 300, TimeUnit.SECONDS); - GenomeLocProcessingTracker tracker = test.getTracker(); - List shards = test.getShards(); - - for ( TestThread thread : threads ) - logger.warn(String.format("TestThread %s ran %d jobs of %d to run", thread.name, thread.ran.size(), thread.toRun.size())); - - assertAllThreadsFinished(results); - - // we ran everything - Assert.assertEquals(tracker.updateAndGetProcessingLocs(NAME_ONE).size(), shards.size(), "Not all shards were run"); - - for ( GenomeLoc shard : shards ) { - Assert.assertTrue(tracker.locIsOwned(shard, NAME_ONE), "Unowned shard"); - - ProcessingLoc proc = tracker.findOwner(shard, NAME_ONE); - Assert.assertNotNull(proc, "Proc was null"); - - Assert.assertNotNull(proc.getOwner(), "Owner was null"); - Assert.assertEquals(proc.getLocation(), shard, "Shard loc doesn't make ProcessingLoc"); - - TestThread owner = findOwner(proc.getOwner(), threads); - Assert.assertNotNull(owner, "Couldn't find owner"); - - Assert.assertTrue(owner.ran.contains(shard), "Owner doesn't contain ran shard"); - - for ( TestThread thread : threads ) - if ( ! 
proc.isOwnedBy(thread.name) && thread.ran.contains(shard) ) - Assert.fail("Shard appears in another run list: proc=" + proc + " shard=" + shard + " also in jobs of " + thread.name + " obj=" + thread.ran.get(thread.ran.indexOf(shard))); - - } - } catch (InterruptedException e) { - Assert.fail("Thread failure", e); - } - } -} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java index a344817a0..bde4c4ae3 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextIntegrationTest.java @@ -19,14 +19,14 @@ public class VariantContextIntegrationTest extends WalkerTest { static HashMap expectations = new HashMap(); static { - expectations.put("-L 1:1-10000 --printPerLocus", "e4ee2eaa3114888e918a1c82df7a027a"); - expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly", "5b5635e4877d82e8a27d70dac24bda2f"); - expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsStartinAtCurrentPosition", "ceced3f270b4fe407ee83bc9028becde"); - expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly --onlyContextsStartinAtCurrentPosition", "9a9b9e283553c28bf58de1cafa38fe92"); + expectations.put("-L 1:1-10000 --printPerLocus", "c44a48dd9062a435a3579145ce8d1684"); + expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly", "fa5762fa7dcb2652ed34bcdce9ecf455"); + expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsStartinAtCurrentPosition", "dfdc554c52707541d335c3fb849feaba"); + expectations.put("-L 1:1-10000 --printPerLocus --takeFirstOnly --onlyContextsStartinAtCurrentPosition", "db8ba72b557ebd698215281e5656b59c"); expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType SNP", "2097e32988d603d3b353b50218c86d3b"); - 
expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL", "033bd952fca048fe1a4f6422b57ab2ed"); - expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL --onlyContextsStartinAtCurrentPosition", "5e40980c02797f90821317874426a87a"); - expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType MIXED", "e5a00766f8c1ff9cf92310bafdec3126"); + expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL", "7f5eadb2098aafdef8bb45aac3722d03"); + expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType INDEL --onlyContextsStartinAtCurrentPosition", "a31b76fb8ed727616d8fb823c62bf677"); + expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType MIXED", "f9d30920c8834ec7c7892507a5052fb7"); expectations.put("-L 1:1-10000 --printPerLocus --onlyContextsOfType NO_VARIATION", "39335acdb34c8a2af433dc50d619bcbc"); } @@ -58,7 +58,7 @@ public class VariantContextIntegrationTest extends WalkerTest { // this really just tests that we are seeing the same number of objects over all of chr1 WalkerTestSpec spec = new WalkerTestSpec( root + " -L 1" + " -o %s", 1, // just one output file - Arrays.asList("529f936aa6c303658b23caf4e527782f")); + Arrays.asList("137258e1dc490bfa83a2294c52e97ba9")); executeTest("testLargeScaleConversion", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java index e82817714..d8fa0eae4 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java @@ -92,45 +92,45 @@ public class VariantContextUnitTest { // test INDELs alleles = Arrays.asList(Aref, ATC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, 
null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(Tref, TA, TC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A, AC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); alleles = Arrays.asList(ATCref, A, Allele.create("ATCTC")); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+2, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.INDEL); // test MIXED alleles = Arrays.asList(TAref, T, TC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); alleles = Arrays.asList(TAref, T, AC); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, 
alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); alleles = Arrays.asList(ACref, ATC, AT); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop+1, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); alleles = Arrays.asList(Aref, T, symbolic); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.MIXED); // test SYMBOLIC alleles = Arrays.asList(Tref, symbolic); - vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles); + vc = new VariantContext("test", snpLoc,snpLocStart, snpLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getType(), VariantContext.Type.SYMBOLIC); } @@ -191,7 +191,7 @@ public class VariantContextUnitTest { @Test public void testCreatingDeletionVariantContext() { List alleles = Arrays.asList(ATCref, del); - VariantContext vc = new VariantContext("test", delLoc, delLocStart, delLocStop, alleles); + VariantContext vc = new VariantContext("test", delLoc, delLocStart, delLocStop, alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getChr(), delLoc); Assert.assertEquals(vc.getStart(), delLocStart); @@ -218,7 +218,7 @@ public class VariantContextUnitTest { @Test public void testCreatingInsertionVariantContext() { List alleles = Arrays.asList(delRef, ATC); - VariantContext vc = new VariantContext("test", insLoc, insLocStart, insLocStop, alleles); + VariantContext vc = new VariantContext("test", insLoc, insLocStart, insLocStop, 
alleles, null, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, (byte)'A'); Assert.assertEquals(vc.getChr(), insLoc); Assert.assertEquals(vc.getStart(), insLocStart); @@ -251,7 +251,7 @@ public class VariantContextUnitTest { new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, del)); } - @Test (expectedExceptions = IllegalArgumentException.class) + @Test (expectedExceptions = IllegalStateException.class) public void testBadConstructorArgs3() { new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(del)); } diff --git a/public/packages/PicardPrivate.xml b/public/packages/PicardPrivate.xml index 110b41d3f..581c47979 100644 --- a/public/packages/PicardPrivate.xml +++ b/public/packages/PicardPrivate.xml @@ -7,6 +7,8 @@ + + diff --git a/public/packages/Queue.xml b/public/packages/Queue.xml index 58da4398e..589cb45f5 100644 --- a/public/packages/Queue.xml +++ b/public/packages/Queue.xml @@ -41,6 +41,7 @@ + diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index 6a47d4b97..116d16f35 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -2,7 +2,6 @@ package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript -import org.broadinstitute.sting.queue.function.ListWriterFunction import org.broadinstitute.sting.queue.extensions.picard._ import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel import org.broadinstitute.sting.utils.baq.BAQ.CalculationMode @@ -12,6 +11,7 @@ import net.sf.samtools.SAMFileReader import net.sf.samtools.SAMFileHeader.SortOrder import org.broadinstitute.sting.queue.util.QScriptUtils 
+import org.broadinstitute.sting.queue.function.{CommandLineFunction, ListWriterFunction} class DataProcessingPipeline extends QScript { qscript => @@ -72,6 +72,9 @@ class DataProcessingPipeline extends QScript { @Input(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false) var bwaThreads: Int = 1 + @Input(doc="Dont perform validation on the BAM files", fullName="no_validation", shortName="nv", required=false) + var noValidation: Boolean = false + /**************************************************************************** * Global Variables @@ -135,7 +138,7 @@ class DataProcessingPipeline extends QScript { } } - println("\n\n*** DEBUG ***\n") + println("\n\n*** INPUT FILES ***\n") // Creating one file for each sample in the dataset val sampleBamFiles = scala.collection.mutable.Map.empty[String, File] for ((sample, flist) <- sampleTable) { @@ -149,7 +152,7 @@ class DataProcessingPipeline extends QScript { sampleBamFiles(sample) = sampleFileName add(joinBams(flist, sampleFileName)) } - println("*** DEBUG ***\n\n") + println("*** INPUT FILES ***\n\n") return sampleBamFiles.toMap } @@ -246,7 +249,12 @@ class DataProcessingPipeline extends QScript { val preValidateLog = swapExt(bam, ".bam", ".pre.validation") val postValidateLog = swapExt(bam, ".bam", ".post.validation") - add(validate(bam, preValidateLog)) + // Validation is an optional step for the BAM file generated after + // alignment and the final bam file of the pipeline. 
+ if (!noValidation) { + add(validate(bam, preValidateLog), + validate(recalBam, postValidateLog)) + } if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY) add(target(bam, targetIntervals)) @@ -257,8 +265,8 @@ class DataProcessingPipeline extends QScript { recal(dedupedBam, preRecalFile, recalBam), cov(recalBam, postRecalFile), analyzeCovariates(preRecalFile, preOutPath), - analyzeCovariates(postRecalFile, postOutPath), - validate(recalBam, postValidateLog)) + analyzeCovariates(postRecalFile, postOutPath)) + cohortList :+= recalBam } @@ -275,20 +283,29 @@ class DataProcessingPipeline extends QScript { ****************************************************************************/ - // General arguments to GATK walkers - trait CommandLineGATKArgs extends CommandLineGATK { - this.reference_sequence = qscript.reference + + // General arguments to non-GATK tools + trait ExternalCommonArgs extends CommandLineFunction { this.memoryLimit = 4 this.isIntermediate = true } + // General arguments to GATK walkers + trait CommandLineGATKArgs extends CommandLineGATK with ExternalCommonArgs { + this.reference_sequence = qscript.reference + } + + trait SAMargs extends PicardBamFunction with ExternalCommonArgs { + this.maxRecordsInRam = 100000 + } + case class target (inBams: File, outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs { if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY) this.input_file :+= inBams this.out = outIntervals this.mismatchFraction = 0.0 this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - if (!indels.isEmpty) + if (indels != null) this.rodBind :+= RodBind("indels", "VCF", indels) this.scatterCount = nContigs this.analysisName = queueLogDir + outIntervals + ".target" @@ -300,8 +317,8 @@ class DataProcessingPipeline extends QScript { this.targetIntervals = tIntervals this.out = outBam this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) - if (!indels.isEmpty) - this.rodBind :+= RodBind("indels", "VCF", indels) + if 
(qscript.indels != null) + this.rodBind :+= RodBind("indels", "VCF", qscript.indels) this.consensusDeterminationModel = consensusDeterminationModel this.compress = 0 this.scatterCount = nContigs @@ -332,7 +349,6 @@ class DataProcessingPipeline extends QScript { this.isIntermediate = false this.analysisName = queueLogDir + outBam + ".recalibration" this.jobName = queueLogDir + outBam + ".recalibration" - } @@ -350,48 +366,44 @@ class DataProcessingPipeline extends QScript { this.jobName = queueLogDir + inRecalFile + ".analyze_covariates" } - case class dedup (inBam: File, outBam: File, metricsFile: File) extends MarkDuplicates { + case class dedup (inBam: File, outBam: File, metricsFile: File) extends MarkDuplicates with ExternalCommonArgs { + @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") this.input = List(inBam) this.output = outBam this.metrics = metricsFile - this.memoryLimit = 6 - this.isIntermediate = true this.analysisName = queueLogDir + outBam + ".dedup" this.jobName = queueLogDir + outBam + ".dedup" } - case class joinBams (inBams: List[File], outBam: File) extends MergeSamFiles { + case class joinBams (inBams: List[File], outBam: File) extends MergeSamFiles with ExternalCommonArgs { + @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") this.input = inBams this.output = outBam - this.memoryLimit = 4 - this.isIntermediate = true this.analysisName = queueLogDir + outBam + ".joinBams" this.jobName = queueLogDir + outBam + ".joinBams" } - case class sortSam (inSam: File, outBam: File, sortOrderP: SortOrder) extends SortSam { + case class sortSam (inSam: File, outBam: File, sortOrderP: SortOrder) extends SortSam with ExternalCommonArgs { + @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") this.input = List(inSam) this.output = outBam this.sortOrder = sortOrderP - this.memoryLimit = 4 - this.isIntermediate = true this.analysisName = queueLogDir + outBam + ".sortSam" this.jobName = queueLogDir 
+ outBam + ".sortSam" } - case class validate (inBam: File, outLog: File) extends ValidateSamFile { + case class validate (inBam: File, outLog: File) extends ValidateSamFile with ExternalCommonArgs { this.input = List(inBam) this.output = outLog - this.maxRecordsInRam = 100000 this.REFERENCE_SEQUENCE = qscript.reference - this.memoryLimit = 4 this.isIntermediate = false this.analysisName = queueLogDir + outLog + ".validate" this.jobName = queueLogDir + outLog + ".validate" } - case class addReadGroup (inBam: File, outBam: File, readGroup: ReadGroup) extends AddOrReplaceReadGroups { + case class addReadGroup (inBam: File, outBam: File, readGroup: ReadGroup) extends AddOrReplaceReadGroups with ExternalCommonArgs { + @Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai") this.input = List(inBam) this.output = outBam this.RGID = readGroup.id @@ -401,18 +413,11 @@ class DataProcessingPipeline extends QScript { this.RGPL = readGroup.pl this.RGPU = readGroup.pu this.RGSM = readGroup.sm - this.memoryLimit = 4 - this.isIntermediate = true this.analysisName = queueLogDir + outBam + ".rg" this.jobName = queueLogDir + outBam + ".rg" } - trait BWACommonArgs extends CommandLineFunction { - this.memoryLimit = 4 - this.isIntermediate = true - } - - case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with BWACommonArgs { + case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with ExternalCommonArgs { @Input(doc="bam file to be aligned") var bam = inBam @Output(doc="output sai file") var sai = outSai def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b " + bam + " > " + sai @@ -420,7 +425,7 @@ class DataProcessingPipeline extends QScript { this.jobName = queueLogDir + outSai + ".bwa_aln_se" } - case class bwa_aln_pe (inBam: File, outSai1: File, index: Int) extends CommandLineFunction with BWACommonArgs { + case class bwa_aln_pe (inBam: File, outSai1: File, index: Int) extends 
CommandLineFunction with ExternalCommonArgs { @Input(doc="bam file to be aligned") var bam = inBam @Output(doc="output sai file for 1st mating pair") var sai = outSai1 def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b" + index + " " + bam + " > " + sai @@ -428,21 +433,23 @@ class DataProcessingPipeline extends QScript { this.jobName = queueLogDir + outSai1 + ".bwa_aln_pe1" } - case class bwa_sam_se (inBam: File, inSai: File, outBam: File) extends CommandLineFunction with BWACommonArgs { + case class bwa_sam_se (inBam: File, inSai: File, outBam: File) extends CommandLineFunction with ExternalCommonArgs { @Input(doc="bam file to be aligned") var bam = inBam @Input(doc="bwa alignment index file") var sai = inSai @Output(doc="output aligned bam file") var alignedBam = outBam def commandLine = bwaPath + " samse " + reference + " " + sai + " " + bam + " > " + alignedBam + this.memoryLimit = 6 this.analysisName = queueLogDir + outBam + ".bwa_sam_se" this.jobName = queueLogDir + outBam + ".bwa_sam_se" } - case class bwa_sam_pe (inBam: File, inSai1: File, inSai2:File, outBam: File) extends CommandLineFunction with BWACommonArgs { + case class bwa_sam_pe (inBam: File, inSai1: File, inSai2:File, outBam: File) extends CommandLineFunction with ExternalCommonArgs { @Input(doc="bam file to be aligned") var bam = inBam @Input(doc="bwa alignment index file for 1st mating pair") var sai1 = inSai1 @Input(doc="bwa alignment index file for 2nd mating pair") var sai2 = inSai2 @Output(doc="output aligned bam file") var alignedBam = outBam def commandLine = bwaPath + " sampe " + reference + " " + sai1 + " " + sai2 + " " + bam + " " + bam + " > " + alignedBam + this.memoryLimit = 6 this.analysisName = queueLogDir + outBam + ".bwa_sam_pe" this.jobName = queueLogDir + outBam + ".bwa_sam_pe" } @@ -453,6 +460,4 @@ class DataProcessingPipeline extends QScript { this.analysisName = queueLogDir + outBamList + ".bamList" this.jobName = queueLogDir + outBamList + 
".bamList" } - - } diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 150d78019..934cf2a3c 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -15,8 +15,8 @@ class GATKResourcesBundle extends QScript { @Argument(doc="liftOverPerl", required=false) var liftOverPerl: File = new File("./perl/liftOverVCF.pl") - @Argument(shortName = "svn", doc="The SVN version of this release", required=true) - var SVN_VERSION: String = _ + @Argument(shortName = "ver", doc="The SVN version of this release", required=true) + var VERSION: String = _ @Argument(shortName = "bundleDir", doc="Path to root where resource files will be placed", required=false) val BUNDLE_ROOT = new File("/humgen/gsa-hpprojects/GATK/bundle") @@ -32,8 +32,8 @@ class GATKResourcesBundle extends QScript { val SITES_EXT: String = "sites" - def BUNDLE_DIR: File = BUNDLE_ROOT + "/" + SVN_VERSION - def DOWNLOAD_DIR: File = DOWNLOAD_ROOT + "/" + SVN_VERSION + def BUNDLE_DIR: File = BUNDLE_ROOT + "/" + VERSION + def DOWNLOAD_DIR: File = DOWNLOAD_ROOT + "/" + VERSION // REFERENCES class Reference( val name: String, val file: File ) { } @@ -113,6 +113,12 @@ class GATKResourcesBundle extends QScript { addResource(new Resource(hg19.file, "", hg19, false)) addResource(new Resource(hg18.file, "", hg18, false)) + // + // The b37_decoy reference + // + addResource(new Resource("/humgen/1kg/reference/human_g1k_v37_decoy.fasta", + "IGNORE", b37, false, false)) + // // standard VCF files. 
Will be lifted to each reference // diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala index fca420816..cbe53db8d 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala @@ -20,14 +20,14 @@ class RecalibrateBaseQualities extends QScript { @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true) var input: File = _ - @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=false) - var R: String = new File("/humgen/gsa-scr1/carneiro/stable/R") + @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=true) + var R: String = _ - @Input(doc="Reference fasta file", shortName="R", required=false) - var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") + @Input(doc="Reference fasta file", shortName="R", required=true) + var reference: File = _ // new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") - @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false) - var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") + @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=true) + var dbSNP: File = _ // new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") val queueLogDir: String = ".qlog/" var nContigs: Int = 0 @@ -42,8 +42,8 @@ class RecalibrateBaseQualities extends QScript { val recalFile1: File = swapExt(bam, ".bam", ".recal1.csv") val recalFile2: File = swapExt(bam, ".bam", ".recal2.csv") val recalBam: File = swapExt(bam, ".bam", ".recal.bam") - val path1: String = bam + ".before" - 
val path2: String = bam + ".after" + val path1: String = recalBam + ".before" + val path2: String = recalBam + ".after" add(cov(bam, recalFile1), recal(bam, recalFile1, recalBam), diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala index 4a93233eb..1d473b210 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala @@ -59,10 +59,10 @@ class ExampleUnifiedGenotyper extends QScript { evalUnfiltered.rodBind :+= RodBind("eval", "VCF", genotyper.out) evalUnfiltered.out = swapExt(genotyper.out, "vcf", "eval") - variantFilter.rodBind :+= RodBind("vcf", "VCF", genotyper.out) + variantFilter.rodBind :+= RodBind("variant", "VCF", genotyper.out) variantFilter.out = swapExt(qscript.bamFile, "bam", "filtered.vcf") variantFilter.filterName = filterNames - variantFilter.filterExpression = filterExpressions + variantFilter.filterExpression = filterExpressions.map("\"" + _ + "\"") evalFiltered.rodBind :+= RodBind("eval", "VCF", variantFilter.out) evalFiltered.out = swapExt(variantFilter.out, "vcf", "eval") diff --git a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala index 71970a36b..05c1a1775 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala @@ -45,7 +45,7 @@ class QSettings { var jobPriority: Option[Int] = None @Argument(fullName="default_memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes.", required=false) - var memoryLimit: Option[Int] = None + var memoryLimit: Option[Double] = None @Argument(fullName="run_directory", shortName="runDir", doc="Root 
directory to run functions from.", required=false) var runDirectory = new File(".") diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala index 2fbfab5ec..2e3108136 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/CommandLineJobRunner.scala @@ -33,12 +33,29 @@ import org.broadinstitute.sting.queue.util.{Logging, IOUtils} */ trait CommandLineJobRunner extends JobRunner[CommandLineFunction] with Logging { + /** The string representation of the identifier of the running job. */ + def jobIdString: String = null + /** A generated exec shell script. */ protected var jobScript: File = _ /** Which directory to use for the job status files. */ protected def jobStatusDir = function.jobTempDir + /** Amount of time a job can go without status before giving up. */ + private val unknownStatusMaxSeconds = 5 * 60 + + /** Last known status */ + protected var lastStatus: RunnerStatus.Value = _ + + /** The last time the status was updated */ + protected var lastStatusUpdate: Long = _ + + final override def status = this.lastStatus + + def residentRequestMB: Option[Double] = function.memoryLimit.map(_ * 1024) + def residentLimitMB: Option[Double] = residentRequestMB.map( _ * 1.2 ) + override def init() { super.init() var exec = new StringBuilder @@ -53,7 +70,21 @@ trait CommandLineJobRunner extends JobRunner[CommandLineFunction] with Logging { } exec.append(function.commandLine) - this.jobScript = IOUtils.writeTempFile(exec.toString, ".exec", "", jobStatusDir) + this.jobScript = IOUtils.writeTempFile(exec.toString(), ".exec", "", jobStatusDir) + } + + protected def updateStatus(updatedStatus: RunnerStatus.Value) { + this.lastStatus = updatedStatus + this.lastStatusUpdate = System.currentTimeMillis + } + + override def checkUnknownStatus() { + val 
unknownStatusMillis = (System.currentTimeMillis - lastStatusUpdate) + if (unknownStatusMillis > (unknownStatusMaxSeconds * 1000L)) { + // Unknown status has been returned for a while now. + updateStatus(RunnerStatus.FAILED) + logger.error("Unable to read status for %.2f minutes: job id %s: %s".format(unknownStatusMillis/(60 * 1000D), jobIdString, function.description)) + } } override def cleanup() { diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala index d2be4939a..30187f7e2 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/JobManager.scala @@ -44,9 +44,9 @@ trait JobManager[TFunction <: QFunction, TRunner <: JobRunner[TFunction]] { /** * Updates the status on a list of functions. * @param runners Runners to update. + * @return runners which were updated. */ - def updateStatus(runners: Set[TRunner]) { - } + def updateStatus(runners: Set[TRunner]): Set[TRunner] = Set.empty /** * Stops a list of functions. diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunner.scala index 4b4d44988..de5fbde05 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/JobRunner.scala @@ -52,6 +52,11 @@ trait JobRunner[TFunction <: QFunction] { */ def status: RunnerStatus.Value + /** + * Checks if the status has been unknown for an extended period of time. + */ + def checkUnknownStatus() {} + /** * Returns the function to be run. 
*/ diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala index 8ed3f84c1..a52e9c561 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala @@ -1005,7 +1005,10 @@ class QGraph extends Logging { .asInstanceOf[Set[JobRunner[QFunction]]] if (managerRunners.size > 0) try { - manager.updateStatus(managerRunners) + val updatedRunners = manager.updateStatus(managerRunners) + for (runner <- managerRunners.diff(updatedRunners)) { + runner.checkUnknownStatus() + } } catch { case e => /* ignore */ } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala index 82edf6221..8c639b5bb 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/gridengine/GridEngineJobRunner.scala @@ -40,12 +40,7 @@ class GridEngineJobRunner(val function: CommandLineFunction) extends CommandLine /** Job Id of the currently executing job. 
*/ private var jobId: String = _ - - /** Last known status */ - private var lastStatus: RunnerStatus.Value = _ - - /** The last time the status was updated */ - protected var lastStatusUpdate: Long = _ + override def jobIdString = jobId def start() { GridEngineJobRunner.gridEngineSession.synchronized { @@ -82,11 +77,14 @@ class GridEngineJobRunner(val function: CommandLineFunction) extends CommandLine nativeSpecString += " -q " + function.jobQueue } - // If the memory limit is set (GB) specify the memory limit - if (function.memoryLimit.isDefined) { - val memAvl: String = function.memoryLimit.get + "G" - val memMax: String = (function.memoryLimit.get * 1.2 * 1024).ceil.toInt + "M" - nativeSpecString += " -l mem_free=" + memAvl + ",h_rss=" + memMax + // If the resident set size is requested pass on the memory request + if (residentRequestMB.isDefined) { + nativeSpecString += " -l mem_free=%dM".format(residentRequestMB.get.ceil.toInt) + } + + // If the resident set size limit is defined specify the memory limit + if (residentLimitMB.isDefined) { + nativeSpecString += " -l h_rss=%dM".format(residentLimitMB.get.ceil.toInt) } // If the priority is set (user specified Int) specify the priority @@ -121,21 +119,11 @@ class GridEngineJobRunner(val function: CommandLineFunction) extends CommandLine logger.info("Submitted Grid Engine job id: " + jobId) } } - - def status = this.lastStatus - - private def updateStatus(updatedStatus: RunnerStatus.Value) { - this.lastStatus = updatedStatus - this.lastStatusUpdate = System.currentTimeMillis - } } object GridEngineJobRunner extends Logging { private val gridEngineSession = SessionFactory.getFactory.getSession - /** Amount of time a job can go without status before giving up. */ - private val unknownStatusMaxSeconds = 5 * 60 - initGridEngine() /** @@ -156,16 +144,14 @@ object GridEngineJobRunner extends Logging { /** * Updates the status of a list of jobs. * @param runners Runners to update. + * @return runners which were updated. 
*/ - def updateStatus(runners: Set[GridEngineJobRunner]) { + def updateStatus(runners: Set[GridEngineJobRunner]) = { var updatedRunners = Set.empty[GridEngineJobRunner] gridEngineSession.synchronized { runners.foreach(runner => if (updateRunnerStatus(runner)) {updatedRunners += runner}) } - - for (runner <- runners.diff(updatedRunners)) { - checkUnknownStatus(runner) - } + updatedRunners } /** @@ -219,20 +205,11 @@ object GridEngineJobRunner extends Logging { logger.warn("Unable to determine status of Grid Engine job id " + runner.jobId, de) } - Option(returnStatus) match { - case Some(returnStatus) => - runner.updateStatus(returnStatus) - return true - case None => return false - } - } - - private def checkUnknownStatus(runner: GridEngineJobRunner) { - val unknownStatusSeconds = (System.currentTimeMillis - runner.lastStatusUpdate) - if (unknownStatusSeconds > (unknownStatusMaxSeconds * 1000L)) { - // Unknown status has been returned for a while now. - runner.updateStatus(RunnerStatus.FAILED) - logger.error("Unable to read Grid Engine status for %d minutes: job id %d: %s".format(unknownStatusSeconds/60, runner.jobId, runner.function.description)) + if (returnStatus != null) { + runner.updateStatus(returnStatus) + true + } else { + false } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobManager.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobManager.scala index c0fff9125..23ddab619 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobManager.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobManager.scala @@ -34,6 +34,6 @@ class Lsf706JobManager extends CommandLineJobManager[Lsf706JobRunner] { def runnerType = classOf[Lsf706JobRunner] def create(function: CommandLineFunction) = new Lsf706JobRunner(function) - override def updateStatus(runners: Set[Lsf706JobRunner]) { Lsf706JobRunner.updateStatus(runners) } + override def updateStatus(runners: 
Set[Lsf706JobRunner]) = { Lsf706JobRunner.updateStatus(runners) } override def tryStop(runners: Set[Lsf706JobRunner]) { Lsf706JobRunner.tryStop(runners) } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala index ac2f036b4..46dd08332 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala @@ -32,8 +32,8 @@ import org.broadinstitute.sting.utils.Utils import org.broadinstitute.sting.jna.clibrary.LibC import org.broadinstitute.sting.jna.lsf.v7_0_6.LibBat.{submitReply, submit} import com.sun.jna.ptr.IntByReference -import com.sun.jna.{StringArray, NativeLong} import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner} +import com.sun.jna.{Structure, StringArray, NativeLong} /** * Runs jobs on an LSF compute cluster. @@ -45,12 +45,7 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR /** Job Id of the currently executing job. */ private var jobId = -1L - - /** Last known status */ - private var lastStatus: RunnerStatus.Value = _ - - /** The last time the status was updated */ - protected var lastStatusUpdate: Long = _ + override def jobIdString = jobId.toString /** * Dispatches the function on the LSF cluster. 
@@ -85,12 +80,19 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR request.options |= LibBat.SUB_QUEUE } - // If the memory limit is set (GB) specify the memory limit - if (function.memoryLimit.isDefined) { - request.resReq = "rusage[mem=" + function.memoryLimit.get + "]" + // If the resident set size is requested pass on the memory request + if (residentRequestMB.isDefined) { + val memInUnits = Lsf706JobRunner.convertUnits(residentRequestMB.get) + request.resReq = "select[mem>%1$d] rusage[mem=%1$d]".format(memInUnits) request.options |= LibBat.SUB_RES_REQ } + // If the resident set size limit is defined specify the memory limit + if (residentLimitMB.isDefined) { + val memInUnits = Lsf706JobRunner.convertUnits(residentLimitMB.get) + request.rLimits(LibLsf.LSF_RLIMIT_RSS) = memInUnits + } + // If the priority is set (user specified Int) specify the priority if (function.jobPriority.isDefined) { request.userPriority = function.jobPriority.get @@ -122,11 +124,13 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR } } - def status = this.lastStatus - - private def updateStatus(updatedStatus: RunnerStatus.Value) { - this.lastStatus = updatedStatus - this.lastStatusUpdate = System.currentTimeMillis + override def checkUnknownStatus() { + // TODO: Need a second pass through either of the two archive logs using lsb_geteventrecbyline() for disappeared jobs. + // Can also tell if we wake up and the last time we saw status was greater than lsb_parameterinfo().cleanPeriod + // LSB_SHAREDIR/cluster_name/logdir/lsb.acct (man bacct) + // LSB_SHAREDIR/cluster_name/logdir/lsb.events (man bhist) + logger.debug("Job Id %s status / exitStatus / exitInfo: ??? / ??? / ???".format(jobId)) + super.checkUnknownStatus() } } @@ -137,17 +141,8 @@ object Lsf706JobRunner extends Logging { /** Number of seconds for a non-normal exit status before we give up on expecting LSF to retry the function. 
*/ private val retryExpiredSeconds = 5 * 60 - /** Amount of time a job can go without status before giving up. */ - private val unknownStatusMaxSeconds = 5 * 60 - initLsf() - /** The name of the default queue. */ - private var defaultQueue: String = _ - - /** The run limits for each queue. */ - private var queueRlimitRun = Map.empty[String,Int] - /** * Initialize the Lsf library. */ @@ -161,8 +156,9 @@ object Lsf706JobRunner extends Logging { /** * Bulk updates job statuses. * @param runners Runners to update. + * @return runners which were updated. */ - def updateStatus(runners: Set[Lsf706JobRunner]) { + def updateStatus(runners: Set[Lsf706JobRunner]) = { var updatedRunners = Set.empty[Lsf706JobRunner] Lsf706JobRunner.lsfLibLock.synchronized { @@ -192,70 +188,7 @@ object Lsf706JobRunner extends Logging { } } - for (runner <- runners.diff(updatedRunners)) { - checkUnknownStatus(runner) - } - } - - /** - * Tries to stop any running jobs. - * @param runners Runners to stop. - */ - def tryStop(runners: Set[Lsf706JobRunner]) { - lsfLibLock.synchronized { - // lsb_killbulkjobs does not seem to forward SIGTERM, - // only SIGKILL, so send the Ctrl-C (SIGTERM) one by one. - for (runner <- runners.filterNot(_.jobId < 0)) { - try { - if (LibBat.lsb_signaljob(runner.jobId, SIGTERM) < 0) - logger.error(LibBat.lsb_sperror("Unable to kill job " + runner.jobId)) - } catch { - case e => - logger.error("Unable to kill job " + runner.jobId, e) - } - } - } - } - - - /** - * Returns the run limit in seconds for the queue. - * If the queue name is null returns the length of the default queue. - * @param queue Name of the queue or null for the default queue. - * @return the run limit in seconds for the queue. - */ - private def getRlimitRun(queue: String) = { - lsfLibLock.synchronized { - if (queue == null) { - if (defaultQueue != null) { - queueRlimitRun(defaultQueue) - } else { - // Get the info on the default queue. 
- val numQueues = new IntByReference(1) - val queueInfo = LibBat.lsb_queueinfo(null, numQueues, null, null, 0) - if (queueInfo == null) - throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for the default queue")) - defaultQueue = queueInfo.queue - val limit = queueInfo.rLimits(LibLsf.LSF_RLIMIT_RUN) - queueRlimitRun += defaultQueue -> limit - limit - } - } else { - queueRlimitRun.get(queue) match { - case Some(limit) => limit - case None => - // Cache miss. Go get the run limits from LSF. - val queues = new StringArray(Array[String](queue)) - val numQueues = new IntByReference(1) - val queueInfo = LibBat.lsb_queueinfo(queues, numQueues, null, null, 0) - if (queueInfo == null) - throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for queue: " + queue)) - val limit = queueInfo.rLimits(LibLsf.LSF_RLIMIT_RUN) - queueRlimitRun += queue -> limit - limit - } - } - } + updatedRunners } private def updateRunnerStatus(runner: Lsf706JobRunner, jobInfo: LibBat.jobInfoEnt) { @@ -280,20 +213,6 @@ object Lsf706JobRunner extends Logging { ) } - private def checkUnknownStatus(runner: Lsf706JobRunner) { - // TODO: Need a second pass through either of the two archive logs using lsb_geteventrecbyline() for disappeared jobs. - // Can also tell if we wake up and the last time we saw status was greater than lsb_parameterinfo().cleanPeriod - // LSB_SHAREDIR/cluster_name/logdir/lsb.acct (man bacct) - // LSB_SHAREDIR/cluster_name/logdir/lsb.events (man bhist) - logger.debug("Job Id %s status / exitStatus / exitInfo: ??? / ??? / ???".format(runner.jobId)) - val unknownStatusMillis = (System.currentTimeMillis - runner.lastStatusUpdate) - if (unknownStatusMillis > (unknownStatusMaxSeconds * 1000L)) { - // Unknown status has been returned for a while now. 
- runner.updateStatus(RunnerStatus.FAILED) - logger.error("Unable to read LSF status for %0.2f minutes: job id %d: %s".format(unknownStatusMillis/(60 * 1000D), runner.jobId, runner.function.description)) - } - } - /** * Returns true if LSF is expected to retry running the function. * @param exitInfo The reason the job exited. @@ -309,4 +228,86 @@ object Lsf706JobRunner extends Logging { } } } + + /** + * Tries to stop any running jobs. + * @param runners Runners to stop. + */ + def tryStop(runners: Set[Lsf706JobRunner]) { + lsfLibLock.synchronized { + // lsb_killbulkjobs does not seem to forward SIGTERM, + // only SIGKILL, so send the Ctrl-C (SIGTERM) one by one. + for (runner <- runners.filterNot(_.jobId < 0)) { + try { + if (LibBat.lsb_signaljob(runner.jobId, SIGTERM) < 0) + logger.error(LibBat.lsb_sperror("Unable to kill job " + runner.jobId)) + } catch { + case e => + logger.error("Unable to kill job " + runner.jobId, e) + } + } + } + } + + /** The name of the default queue. */ + private lazy val defaultQueue: String = { + lsfLibLock.synchronized { + val numQueues = new IntByReference(1) + val queueInfo = LibBat.lsb_queueinfo(null, numQueues, null, null, 0) + if (queueInfo == null) + throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for the default queue")) + queueInfo.queue + } + } + + /** The run limits for each queue. */ + private var queueRlimitRun = Map.empty[String,Int] + + /** + * Returns the run limit in seconds for the queue. + * If the queue name is null returns the length of the default queue. + * @param queue Name of the queue or null for the default queue. + * @return the run limit in seconds for the queue. + */ + private def getRlimitRun(queueName: String) = { + lsfLibLock.synchronized { + val queue = if (queueName == null) defaultQueue else queueName + queueRlimitRun.get(queue) match { + case Some(limit) => limit + case None => + // Cache miss. Go get the run limits from LSF. 
+ val queues = new StringArray(Array(queue)) + val numQueues = new IntByReference(1) + val queueInfo = LibBat.lsb_queueinfo(queues, numQueues, null, null, 0) + if (queueInfo == null) + throw new QException(LibBat.lsb_sperror("Unable to get LSF queue info for queue: " + queue)) + val limit = queueInfo.rLimits(LibLsf.LSF_RLIMIT_RUN) + queueRlimitRun += queue -> limit + limit + } + } + } + + private lazy val unitDivisor: Double = { + lsfLibLock.synchronized { + val unitsParam: Array[LibLsf.config_param] = new LibLsf.config_param().toArray(2).asInstanceOf[Array[LibLsf.config_param]] + unitsParam(0).paramName = "LSF_UNIT_FOR_LIMITS" + + Structure.autoWrite(unitsParam.asInstanceOf[Array[Structure]]) + if (LibLsf.ls_readconfenv(unitsParam(0), null) != 0) + throw new QException(LibBat.lsb_sperror("ls_readconfenv() failed")) + Structure.autoRead(unitsParam.asInstanceOf[Array[Structure]]) + + unitsParam(0).paramValue match { + case "MB" => 1D + case "GB" => 1024D + case "TB" => 1024D * 1024 + case "PB" => 1024D * 1024 * 1024 + case "EB" => 1024D * 1024 * 1024 * 1024 + case null => 1D + } + } + } + + private def convertUnits(mb: Double) = (mb / unitDivisor).ceil.toInt } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala index 603511a30..03f9d3315 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala @@ -50,10 +50,10 @@ class ShellJobRunner(val function: CommandLineFunction) extends CommandLineJobRu // Allow advanced users to update the job. 
updateJobRun(job) - runStatus = RunnerStatus.RUNNING + updateStatus(RunnerStatus.RUNNING) job.run() - runStatus = RunnerStatus.DONE + updateStatus(RunnerStatus.DONE) } - def status = runStatus + override def checkUnknownStatus() {} } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala index 2b1abb2d0..c62fdcd7c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala @@ -9,7 +9,7 @@ trait CommandLineFunction extends QFunction with Logging { def commandLine: String /** Upper memory limit */ - var memoryLimit: Option[Int] = None + var memoryLimit: Option[Double] = None /** Job project to run the command */ var jobProject: String = _ @@ -56,7 +56,7 @@ trait CommandLineFunction extends QFunction with Logging { if (memoryLimit.isEmpty) memoryLimit = qSettings.memoryLimit - super.freezeFieldValues + super.freezeFieldValues() } /** diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala index 72445442e..e8279f62b 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala @@ -47,7 +47,7 @@ trait JavaCommandLineFunction extends CommandLineFunction { /** * Memory limit for the java executable, or if None will use the default memoryLimit. */ - var javaMemoryLimit: Option[Int] = None + var javaMemoryLimit: Option[Double] = None /** * Returns the java executable to run. 
@@ -61,8 +61,8 @@ trait JavaCommandLineFunction extends CommandLineFunction { null } - override def freezeFieldValues = { - super.freezeFieldValues + override def freezeFieldValues() { + super.freezeFieldValues() if (javaMemoryLimit.isEmpty && memoryLimit.isDefined) javaMemoryLimit = memoryLimit @@ -72,7 +72,7 @@ trait JavaCommandLineFunction extends CommandLineFunction { } def javaOpts = "%s -Djava.io.tmpdir=%s" - .format(optional(" -Xmx", javaMemoryLimit, "g"), jobTempDir) + .format(optional(" -Xmx", javaMemoryLimit.map(gb => (gb * 1024).ceil.toInt), "m"), jobTempDir) def commandLine = "java%s %s" .format(javaOpts, javaExecutable) diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala index c2c956118..27ac559c5 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala @@ -34,8 +34,8 @@ import org.broadinstitute.sting.BaseTest import org.broadinstitute.sting.MD5DB import org.broadinstitute.sting.queue.QCommandLine import org.broadinstitute.sting.queue.util.{Logging, ProcessController} -import java.io.{FileNotFoundException, File} -import org.broadinstitute.sting.gatk.report.GATKReportParser +import java.io.File +import org.broadinstitute.sting.gatk.report.GATKReport import org.apache.commons.io.FileUtils import org.broadinstitute.sting.queue.engine.CommandLinePluginManager @@ -118,12 +118,11 @@ object PipelineTest extends BaseTest with Logging { // write the report to the shared validation data location val formatter = new SimpleDateFormat("yyyy.MM.dd.HH.mm.ss") val reportLocation = "%s%s/%s/validation.%s.eval".format(validationReportsDataLocation, jobRunner, name, formatter.format(new Date)) - val report = new File(reportLocation) + val reportFile = new File(reportLocation) - FileUtils.copyFile(new File(runDir(name, jobRunner) + 
evalSpec.evalReport), report); + FileUtils.copyFile(new File(runDir(name, jobRunner) + evalSpec.evalReport), reportFile); - val parser = new GATKReportParser - parser.parse(report) + val report = new GATKReport(reportFile); var allInRange = true @@ -131,7 +130,9 @@ object PipelineTest extends BaseTest with Logging { println(name + " validation values:") println(" value (min,target,max) table key metric") for (validation <- evalSpec.validations) { - val value = parser.getValue(validation.table, validation.key, validation.metric) + val table = report.getTable(validation.table) + val key = table.getPrimaryKey(validation.key) + val value = String.valueOf(table.get(key, validation.metric)) val inRange = if (value == null) false else validation.inRange(value) val flag = if (!inRange) "*" else " " println(" %s %s (%s,%s,%s) %s %s %s".format(flag, value, validation.min, validation.target, validation.max, validation.table, validation.key, validation.metric)) diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala index 0871e769b..7c76823da 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala @@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} class HelloWorldPipelineTest { @Test - def testHelloWorld { + def testHelloWorld() { val spec = new PipelineTestSpec spec.name = "HelloWorld" spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala" @@ -37,15 +37,23 @@ class HelloWorldPipelineTest { } @Test - def testHelloWorldWithPrefix { + def testHelloWorldWithPrefix() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithPrefix" spec.args = "-S 
public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -jobPrefix HelloWorld" PipelineTest.executeTest(spec) } + @Test + def testHelloWorldWithMemoryLimit() { + val spec = new PipelineTestSpec + spec.name = "HelloWorldWithMemoryLimit" + spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -memLimit 1.25" + PipelineTest.executeTest(spec) + } + @Test(enabled=false) - def testHelloWorldWithPriority { + def testHelloWorldWithPriority() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithPriority" spec.args = "-S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/HelloWorld.scala -jobPriority 100" diff --git a/settings/helpTemplates/common.html b/settings/helpTemplates/common.html new file mode 100644 index 000000000..1554a1d40 --- /dev/null +++ b/settings/helpTemplates/common.html @@ -0,0 +1,15 @@ +<#macro makeHeader title> + + ${title} + + + + +<#macro headerInfo> + + +<#macro footerInfo> +

See also Main index | GATK wiki | GATK support forum

+

GATK version ${version} built at ${timestamp}.

+ + diff --git a/settings/helpTemplates/generic.index.template.html b/settings/helpTemplates/generic.index.template.html new file mode 100644 index 000000000..6c9e9f4e8 --- /dev/null +++ b/settings/helpTemplates/generic.index.template.html @@ -0,0 +1,36 @@ +<#include "common.html"/> + +<#macro emitGroup group> + +

${group.name}

+

+ ${group.summary} +

+

+ + + + <#list data as datum> + <#if datum.group == group.name> + + + + + + +
NameSummary
${datum.name}${datum.summary}
+ + + +<@makeHeader title="GATK documentation index"/> + +

GATK documentation index

+ <@headerInfo /> + <#list groups?sort_by("name") as group> + <@emitGroup group=group/> + + + <@footerInfo /> + + + diff --git a/settings/helpTemplates/generic.template.html b/settings/helpTemplates/generic.template.html new file mode 100644 index 000000000..032407164 --- /dev/null +++ b/settings/helpTemplates/generic.template.html @@ -0,0 +1,120 @@ +<#include "common.html"/> + +<#macro argumentlist name myargs> + <#if myargs?size != 0> + ${name} + <#list myargs as arg> + + ${arg.name} + ${arg.type} + ${arg.defaultValue!"No default"} + ${arg.summary} + + <#-- + ${arg.required} + --> + + + + +<#macro argumentDetails arg> +

${arg.name}<#if arg.synonyms??> / ${arg.synonyms} + (<#if arg.attributes??>${arg.attributes} ${arg.type}<#if arg.defaultValue??> with default value ${arg.defaultValue})

+ ${arg.summary}. ${arg.fulltext}
+ <#if arg.options??> +

The ${arg.name} argument is an enumerated type (${arg.type}), which can have one of the following values:

+
+ <#list arg.options as option> +
${option.name} +
${option.summary} + +
+ + + +<#macro relatedByType name type> + <#list relatedDocs as relatedDoc> + <#if relatedDoc.relation == type> +

${name}

+
    + <#list relatedDocs as relatedDoc> + <#if relatedDoc.relation == type> +
  • ${relatedDoc.name} is a ${relatedDoc.relation}
  • + + +
+ <#break> + + + + + +<@makeHeader title="${name} documentation"/> + +

${name}

+ <@headerInfo /> +

${summary}

+ <#if author??> +

Author

+ ${author} + +

Introduction

+ ${description} + + <#-- Create the argument summary --> + <#if arguments.all?size != 0> +
+

${name} specific arguments

+ + + + + + + + + + + <@argumentlist name="Required" myargs=arguments.required/> + <@argumentlist name="Optional" myargs=arguments.optional/> + <@argumentlist name="Hidden" myargs=arguments.hidden/> + <@argumentlist name="Deprecated" myargs=arguments.depreciated/> + +
NameTypeDefault valueSummary
+ + + <#-- Create references to additional capabilities if appropriate --> + <#if extradocs?size != 0> +
+

Additional capabilities

+ The arguments described in the entries below can be supplied to this tool to modify + its behavior. For example, the -L argument directs the GATK engine to restrict processing + to specific genomic intervals. This capability is available to all GATK walkers. + + + + <#-- This class is related to other documented classes via sub/super relationships --> + <#if relatedDocs?size != 0> +
+

Related capabilities

+ <@relatedByType name="Superclasses" type="superclass"/> + <@relatedByType name="Subclasses" type="subclass"/> + + + <#-- List the full details of every argument --> + <#if arguments.all?size != 0> +
+ <#-- Create the argument details --> +

Argument details

+ <#list arguments.all as arg> + <@argumentDetails arg=arg/> + + + + <@footerInfo /> + + diff --git a/settings/helpTemplates/style.css b/settings/helpTemplates/style.css new file mode 100644 index 000000000..1d7bcc576 --- /dev/null +++ b/settings/helpTemplates/style.css @@ -0,0 +1,134 @@ +body +{ + background-color: #ffffff; + color: #202020; +} + +body, p, ul, ol, dl +{ + font-family: Corbel, Verdana, "Lucida Grande", "Lucida Sans Unicode", Sans-Serif; +} + +p, ul, ol, dl, dt, dd, td +{ + font-size: 12pt; +} + +p +{ + margin-left: 1em; +} + +p.summary +{ + margin-left: 2em; + margin-top: -20pt; + font-style: italic; +} + +p.see-also +{ + font-size: 10pt; + margin-left: 0em; + margin-top: 3em; + text-align: center; +} + +p.version +{ + font-size: 8pt; + margin-left: 0em; + margin-top: -8pt; + text-align: center; +} + + +h1, h2, h3, h4 +{ + font-family: Corbel, Arial, Helvetica, Sans-Serif; + font-weight: bold; + text-align: left; +} + +h1 +{ + font-size: 32pt; + letter-spacing: -2px; + color: #669; +} + +h2 +{ + font-size: 16pt; + font-weight: bold; + margin-top: 2em; + color: #669; +} + +h3 +{ + font-size: 12pt; + margin-left: 1em; + color: #000; +} + +hr +{ + margin-top: 4em; +} + +/* + * enum DT layout +*/ + +dl { + border: 1px solid #ccc; +} + +dt { + font-weight: bold; + text-decoration: underline; +} + +dd { + margin: 0; + padding: 0 0 0.5em 0; +} + +/* + * clean table layouts +*/ +#hor-minimalist-b +{ + font-family: "Lucida Sans Unicode", "Lucida Grande", Sans-Serif; + font-size: 12px; + background: #fff; + margin: 5px; + width: 100%; + border-collapse: collapse; + text-align: left; +} +#hor-minimalist-b th +{ + font-size: 14px; + font-weight: normal; + color: #039; + padding: 10px 8px; + border-bottom: 2px solid #6678b1; +} +#hor-minimalist-b td +{ + border-bottom: 1px solid #ccc; + color: #669; + padding: 6px 8px; +} +#hor-minimalist-b tbody tr:hover td +{ + color: #009; +} + +th#row-divider +{ + font-weight: bolder; + font-size: larger; +} \ No newline at 
end of file diff --git a/settings/repository/edu.mit.broad/picard-private-parts-1954.jar b/settings/repository/edu.mit.broad/picard-private-parts-1954.jar deleted file mode 100644 index 67637d3d9..000000000 Binary files a/settings/repository/edu.mit.broad/picard-private-parts-1954.jar and /dev/null differ diff --git a/settings/repository/edu.mit.broad/picard-private-parts-1954.xml~ b/settings/repository/edu.mit.broad/picard-private-parts-1954.xml~ deleted file mode 100644 index 07d51ae53..000000000 --- a/settings/repository/edu.mit.broad/picard-private-parts-1954.xml~ +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/edu.mit.broad/picard-private-parts-1959.jar b/settings/repository/edu.mit.broad/picard-private-parts-1959.jar new file mode 100644 index 000000000..ae11e636b Binary files /dev/null and b/settings/repository/edu.mit.broad/picard-private-parts-1959.jar differ diff --git a/settings/repository/edu.mit.broad/picard-private-parts-1954.xml b/settings/repository/edu.mit.broad/picard-private-parts-1959.xml similarity index 58% rename from settings/repository/edu.mit.broad/picard-private-parts-1954.xml rename to settings/repository/edu.mit.broad/picard-private-parts-1959.xml index c702fd6e5..e7c7e3a21 100644 --- a/settings/repository/edu.mit.broad/picard-private-parts-1954.xml +++ b/settings/repository/edu.mit.broad/picard-private-parts-1959.xml @@ -1,3 +1,3 @@ - + diff --git a/settings/repository/net.sf/picard-1.48.889.xml b/settings/repository/net.sf/picard-1.48.889.xml deleted file mode 100644 index 877687930..000000000 --- a/settings/repository/net.sf/picard-1.48.889.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/net.sf/picard-1.48.889.jar b/settings/repository/net.sf/picard-1.49.895.jar similarity index 95% rename from settings/repository/net.sf/picard-1.48.889.jar rename to settings/repository/net.sf/picard-1.49.895.jar index 1b725dde5..3ee1f2090 100644 Binary files 
a/settings/repository/net.sf/picard-1.48.889.jar and b/settings/repository/net.sf/picard-1.49.895.jar differ diff --git a/settings/repository/net.sf/picard-1.49.895.xml b/settings/repository/net.sf/picard-1.49.895.xml new file mode 100644 index 000000000..52d4900c5 --- /dev/null +++ b/settings/repository/net.sf/picard-1.49.895.xml @@ -0,0 +1,3 @@ + + + diff --git a/settings/repository/net.sf/sam-1.48.889.xml b/settings/repository/net.sf/sam-1.48.889.xml deleted file mode 100644 index 8046a0c02..000000000 --- a/settings/repository/net.sf/sam-1.48.889.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/net.sf/sam-1.48.889.jar b/settings/repository/net.sf/sam-1.49.895.jar similarity index 95% rename from settings/repository/net.sf/sam-1.48.889.jar rename to settings/repository/net.sf/sam-1.49.895.jar index 33ae4aa7d..c55ab0b72 100644 Binary files a/settings/repository/net.sf/sam-1.48.889.jar and b/settings/repository/net.sf/sam-1.49.895.jar differ diff --git a/settings/repository/net.sf/sam-1.49.895.xml b/settings/repository/net.sf/sam-1.49.895.xml new file mode 100644 index 000000000..0436ce881 --- /dev/null +++ b/settings/repository/net.sf/sam-1.49.895.xml @@ -0,0 +1,3 @@ + + +