diff --git a/build.xml b/build.xml
index fe4c7a3f4..438e9c90c 100644
--- a/build.xml
+++ b/build.xml
@@ -79,6 +79,17 @@
+
+
+
+
+
+
+
+
+
+
+
@@ -457,6 +468,26 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -489,6 +520,8 @@
+
+
@@ -957,6 +990,12 @@
+
+
@@ -1050,7 +1089,7 @@
-
+
diff --git a/ivy.xml b/ivy.xml
index 3f3d1c97f..a394aa6d7 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -30,6 +30,9 @@
+
+
+
diff --git a/public/R/src/gsalib/DESCRIPTION b/public/R/src/gsalib/DESCRIPTION
new file mode 100644
index 000000000..6116e8c66
--- /dev/null
+++ b/public/R/src/gsalib/DESCRIPTION
@@ -0,0 +1,10 @@
+Package: gsalib
+Type: Package
+Title: Utility functions
+Version: 1.0
+Date: 2010-10-02
+Author: Kiran Garimella
+Maintainer: Kiran Garimella
+Description: Utility functions for GATK NGS analyses
+License: BSD
+LazyLoad: yes
diff --git a/public/R/src/gsalib/R/gsa.error.R b/public/R/src/gsalib/R/gsa.error.R
new file mode 100644
index 000000000..1c6a56046
--- /dev/null
+++ b/public/R/src/gsalib/R/gsa.error.R
@@ -0,0 +1,12 @@
+gsa.error <- function(message) {
+ message("");
+ gsa.message("Error: **********");
+ gsa.message(sprintf("Error: %s", message));
+ gsa.message("Error: **********");
+ message("");
+
+ traceback();
+
+ message("");
+ stop(message, call. = FALSE);
+}
diff --git a/public/R/src/gsalib/R/gsa.getargs.R b/public/R/src/gsalib/R/gsa.getargs.R
new file mode 100644
index 000000000..94613bf93
--- /dev/null
+++ b/public/R/src/gsalib/R/gsa.getargs.R
@@ -0,0 +1,116 @@
+.gsa.getargs.usage <- function(argspec, doc) {
+ cargs = commandArgs();
+
+ usage = "Usage:";
+
+ fileIndex = grep("--file=", cargs);
+ if (length(fileIndex) > 0) {
+ progname = gsub("--file=", "", cargs[fileIndex[1]]);
+
+ usage = sprintf("Usage: Rscript %s [arguments]", progname);
+
+ if (!is.na(doc)) {
+ message(sprintf("%s: %s\n", progname, doc));
+ }
+ }
+
+ message(usage);
+
+ for (argname in names(argspec)) {
+ key = argname;
+ defaultValue = 0;
+ doc = "";
+
+ if (is.list(argspec[[argname]])) {
+ defaultValue = argspec[[argname]]$value;
+ doc = argspec[[argname]]$doc;
+ }
+
+ message(sprintf(" -%-10s\t[default: %s]\t%s", key, defaultValue, doc));
+ }
+
+ message("");
+
+ stop(call. = FALSE);
+}
+
+gsa.getargs <- function(argspec, doc = NA) {
+ argsenv = new.env();
+
+ for (argname in names(argspec)) {
+ value = 0;
+ if (is.list(argspec[[argname]])) {
+ value = argspec[[argname]]$value;
+ } else {
+ value = argspec[[argname]];
+ }
+
+ assign(argname, value, envir=argsenv);
+ }
+
+ if (interactive()) {
+ for (argname in names(argspec)) {
+ value = get(argname, envir=argsenv);
+
+ if (is.na(value) | is.null(value)) {
+ if (exists("cmdargs")) {
+ assign(argname, cmdargs[[argname]], envir=argsenv);
+ } else {
+ assign(argname, readline(sprintf("Please enter a value for '%s': ", argname)), envir=argsenv);
+ }
+ } else {
+ assign(argname, value, envir=argsenv);
+ }
+ }
+ } else {
+ cargs = commandArgs(TRUE);
+
+ if (length(cargs) == 0) {
+ .gsa.getargs.usage(argspec, doc);
+ }
+
+ for (i in 1:length(cargs)) {
+ if (length(grep("^-", cargs[i], ignore.case=TRUE)) > 0) {
+ key = gsub("-", "", cargs[i]);
+ value = cargs[i+1];
+
+ if (key == "h" | key == "help") {
+ .gsa.getargs.usage(argspec, doc);
+ }
+
+ if (length(grep("^[\\d\\.e\\+\\-]+$", value, perl=TRUE, ignore.case=TRUE)) > 0) {
+ value = as.numeric(value);
+ }
+
+ assign(key, value, envir=argsenv);
+ }
+ }
+ }
+
+ args = as.list(argsenv);
+
+ isMissingArgs = 0;
+ missingArgs = c();
+
+ for (arg in names(argspec)) {
+ if (is.na(args[[arg]]) | is.null(args[[arg]])) {
+ gsa.warn(sprintf("Value for required argument '-%s' was not specified", arg));
+
+ isMissingArgs = 1;
+ missingArgs = c(missingArgs, arg);
+ }
+ }
+
+ if (isMissingArgs) {
+ gsa.error(
+ paste(
+ "Missing required arguments: -",
+ paste(missingArgs, collapse=" -"),
+ ". Specify -h or -help to this script for a list of available arguments.",
+ sep=""
+ )
+ );
+ }
+
+ args;
+}
diff --git a/public/R/src/gsalib/R/gsa.message.R b/public/R/src/gsalib/R/gsa.message.R
new file mode 100644
index 000000000..a2b909d3d
--- /dev/null
+++ b/public/R/src/gsalib/R/gsa.message.R
@@ -0,0 +1,3 @@
+gsa.message <- function(message) {
+ message(sprintf("[gsalib] %s", message));
+}
diff --git a/public/R/src/gsalib/R/gsa.plot.venn.R b/public/R/src/gsalib/R/gsa.plot.venn.R
new file mode 100644
index 000000000..b1353ccc1
--- /dev/null
+++ b/public/R/src/gsalib/R/gsa.plot.venn.R
@@ -0,0 +1,50 @@
+gsa.plot.venn <-
+function(a, b, c=0, a_and_b, a_and_c=0, b_and_c=0,
+ col=c("#FF6342", "#63C6DE", "#ADDE63"),
+ pos=c(0.20, 0.20, 0.80, 0.82),
+ debug=0
+ ) {
+ library(png);
+ library(graphics);
+
+ # Set up properties
+ for (i in 1:length(col)) {
+ rgbcol = col2rgb(col[i]);
+ col[i] = sprintf("%02X%02X%02X", rgbcol[1], rgbcol[2], rgbcol[3]);
+ }
+
+ chco = paste(col[1], col[2], col[3], sep=",");
+ chd = paste(a, b, c, a_and_b, a_and_c, b_and_c, sep=",");
+
+ props = c(
+ 'cht=v',
+ 'chs=525x525',
+ 'chds=0,10000000000',
+ paste('chco=', chco, sep=""),
+ paste('chd=t:', chd, sep="")
+ );
+ proplist = paste(props[1], props[2], props[3], props[4], props[5], sep='&');
+
+ # Get the venn diagram (as a temporary file)
+ filename = tempfile("venn");
+ cmd = paste("wget -O ", filename, " 'http://chart.apis.google.com/chart?", proplist, "' > /dev/null 2>&1", sep="");
+
+ if (debug == 1) {
+ print(cmd);
+ }
+ system(cmd);
+
+ # Render the temp png file into a plotting frame
+ a = readPNG(filename);
+
+ plot(0, 0, type="n", xaxt="n", yaxt="n", bty="n", xlim=c(0, 1), ylim=c(0, 1), xlab="", ylab="");
+ if (c == 0 || a >= b) {
+ rasterImage(a, pos[1], pos[2], pos[3], pos[4]);
+ } else {
+ rasterImage(a, 0.37+pos[1], 0.37+pos[2], 0.37+pos[3], 0.37+pos[4], angle=180);
+ }
+
+ # Clean up!
+ unlink(filename);
+}
+
diff --git a/public/R/src/gsalib/R/gsa.read.eval.R b/public/R/src/gsalib/R/gsa.read.eval.R
new file mode 100644
index 000000000..f1d49092b
--- /dev/null
+++ b/public/R/src/gsalib/R/gsa.read.eval.R
@@ -0,0 +1,83 @@
+.gsa.attemptToLoadFile <- function(filename) {
+ file = NA;
+
+ if (file.exists(filename) & file.info(filename)$size > 500) {
+ file = read.csv(filename, header=TRUE, comment.char="#");
+ }
+
+ file;
+}
+
+gsa.read.eval <-
+function(evalRoot) {
+ fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep="");
+ fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep="");
+ fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep="");
+ fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep="");
+ fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep="");
+ fileMetricsBySample = paste(evalRoot, ".MetricsBySample.csv", sep="");
+ fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep="");
+ fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep="");
+ fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep="");
+ fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep="");
+ fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep="");
+ fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep="");
+ fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep="");
+ fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep="");
+
+ eval = list(
+ AlleleCountStats = NA,
+ CompOverlap = NA,
+ CountVariants = NA,
+ GenotypeConcordance = NA,
+ MetricsByAc = NA,
+ MetricsBySample = NA,
+ Quality_Metrics_by_allele_count = NA,
+ QualityScoreHistogram = NA,
+ SampleStatistics = NA,
+ SampleSummaryStatistics = NA,
+ SimpleMetricsBySample = NA,
+ TiTv = NA,
+ TiTvStats = NA,
+ Variant_Quality_Score = NA,
+
+ CallsetNames = c(),
+ CallsetOnlyNames = c(),
+ CallsetFilteredNames = c()
+ );
+
+ eval$AlleleCountStats = .gsa.attemptToLoadFile(fileAlleleCountStats);
+ eval$CompOverlap = .gsa.attemptToLoadFile(fileCompOverlap);
+ eval$CountVariants = .gsa.attemptToLoadFile(fileCountVariants);
+ eval$GenotypeConcordance = .gsa.attemptToLoadFile(fileGenotypeConcordance);
+ eval$MetricsByAc = .gsa.attemptToLoadFile(fileMetricsByAc);
+ eval$MetricsBySample = .gsa.attemptToLoadFile(fileMetricsBySample);
+ eval$Quality_Metrics_by_allele_count = .gsa.attemptToLoadFile(fileQuality_Metrics_by_allele_count);
+ eval$QualityScoreHistogram = .gsa.attemptToLoadFile(fileQualityScoreHistogram);
+ eval$SampleStatistics = .gsa.attemptToLoadFile(fileSampleStatistics);
+ eval$SampleSummaryStatistics = .gsa.attemptToLoadFile(fileSampleSummaryStatistics);
+ eval$SimpleMetricsBySample = .gsa.attemptToLoadFile(fileSimpleMetricsBySample);
+ eval$TiTv = .gsa.attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator);
+ eval$TiTvStats = .gsa.attemptToLoadFile(fileTiTvStats);
+ eval$Variant_Quality_Score = .gsa.attemptToLoadFile(fileVariant_Quality_Score);
+
+ uniqueJexlExpressions = unique(eval$TiTv$jexl_expression);
+ eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]);
+ eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames));
+ eval$CallsetFilteredNames = as.vector(c(
+ paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), sep=""),
+ paste(gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[2], perl=TRUE), "-Filtered", gsub("^(\\w)", "In\\U\\1", eval$CallsetNames[1], perl=TRUE), sep=""))
+ );
+
+ if (!(eval$CallsetFilteredNames[1] %in% unique(eval$TiTv$jexl_expression))) {
+ eval$CallsetFilteredNames[1] = paste("In", eval$CallsetNames[1], "-FilteredIn", eval$CallsetNames[2], sep="");
+ }
+
+ if (!(eval$CallsetFilteredNames[2] %in% unique(eval$TiTv$jexl_expression))) {
+ eval$CallsetFilteredNames[2] = paste("In", eval$CallsetNames[2], "-FilteredIn", eval$CallsetNames[1], sep="");
+ #eval$CallsetFilteredNames[2] = paste(gsub("^(\\w)", "In", eval$CallsetNames[2], perl=TRUE), "-Filtered", gsub("^(\\w)", "In", eval$CallsetNames[1], perl=TRUE), sep="");
+ }
+
+ eval;
+}
+
diff --git a/public/R/src/gsalib/R/gsa.read.gatkreport.R b/public/R/src/gsalib/R/gsa.read.gatkreport.R
new file mode 100644
index 000000000..011b5240d
--- /dev/null
+++ b/public/R/src/gsalib/R/gsa.read.gatkreport.R
@@ -0,0 +1,103 @@
+# Load a table into the specified environment. Make sure that each new table gets a unique name (this allows one to cat a bunch of tables with the same name together and load them into R without each table overwriting the last.
+.gsa.assignGATKTableToEnvironment <- function(tableName, tableHeader, tableRows, tableEnv) {
+ d = data.frame(tableRows, row.names=NULL, stringsAsFactors=FALSE);
+ colnames(d) = tableHeader;
+
+ for (i in 1:ncol(d)) {
+ v = suppressWarnings(as.numeric(d[,i]));
+
+ if (length(na.omit(as.numeric(v))) == length(d[,i])) {
+ d[,i] = v;
+ }
+ }
+
+ usedNames = ls(envir=tableEnv, pattern=tableName);
+
+ if (length(usedNames) > 0) {
+ tableName = paste(tableName, ".", length(usedNames), sep="");
+ }
+
+ assign(tableName, d, envir=tableEnv);
+}
+
+# Read a fixed width line of text into a list.
+.gsa.splitFixedWidth <- function(line, columnStarts) {
+ splitStartStop <- function(x) {
+ x = substring(x, starts, stops);
+ x = gsub("^[[:space:]]+|[[:space:]]+$", "", x);
+ x;
+ }
+
+ starts = c(1, columnStarts);
+ stops = c(columnStarts - 1, nchar(line));
+
+ sapply(line, splitStartStop)[,1];
+}
+
+# Load all GATKReport tables from a file
+gsa.read.gatkreport <- function(filename) {
+ con = file(filename, "r", blocking = TRUE);
+ lines = readLines(con);
+ close(con);
+
+ tableEnv = new.env();
+
+ tableName = NA;
+ tableHeader = c();
+ tableRows = c();
+ version = NA;
+
+ for (line in lines) {
+ if (length(grep("^##:GATKReport.v", line, ignore.case=TRUE)) > 0) {
+ headerFields = unlist(strsplit(line, "[[:space:]]+"));
+
+ if (!is.na(tableName)) {
+ .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv);
+ }
+
+ tableName = headerFields[2];
+ tableHeader = c();
+ tableRows = c();
+
+ # For differences in versions see
+ # $STING_HOME/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java
+ if (length(grep("^##:GATKReport.v0.1[[:space:]]+", line, ignore.case=TRUE)) > 0) {
+ version = "v0.1";
+
+ } else if (length(grep("^##:GATKReport.v0.2[[:space:]]+", line, ignore.case=TRUE)) > 0) {
+ version = "v0.2";
+ columnStarts = c();
+
+ }
+
+ } else if (length(grep("^[[:space:]]*$", line)) > 0 | length(grep("^[[:space:]]*#", line)) > 0) {
+ # do nothing
+ } else if (!is.na(tableName)) {
+
+ if (version == "v0.1") {
+ row = unlist(strsplit(line, "[[:space:]]+"));
+
+ } else if (version == "v0.2") {
+ if (length(tableHeader) == 0) {
+ headerChars = unlist(strsplit(line, ""));
+ # Find the first position of non space characters, excluding the first character
+ columnStarts = intersect(grep("[[:space:]]", headerChars, invert=TRUE), grep("[[:space:]]", headerChars) + 1);
+ }
+
+ row = .gsa.splitFixedWidth(line, columnStarts);
+ }
+
+ if (length(tableHeader) == 0) {
+ tableHeader = row;
+ } else {
+ tableRows = rbind(tableRows, row);
+ }
+ }
+ }
+
+ if (!is.na(tableName)) {
+ .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv);
+ }
+
+ gatkreport = as.list(tableEnv);
+}
diff --git a/public/R/src/gsalib/R/gsa.read.squidmetrics.R b/public/R/src/gsalib/R/gsa.read.squidmetrics.R
new file mode 100644
index 000000000..39fa1ad32
--- /dev/null
+++ b/public/R/src/gsalib/R/gsa.read.squidmetrics.R
@@ -0,0 +1,28 @@
+gsa.read.squidmetrics = function(project, bylane = FALSE) {
+ suppressMessages(library(ROracle));
+
+ drv = dbDriver("Oracle");
+ con = dbConnect(drv, "REPORTING/REPORTING@ora01:1521/SEQPROD");
+
+ if (bylane) {
+ statement = paste("SELECT * FROM ILLUMINA_PICARD_METRICS WHERE \"Project\" = '", project, "'", sep="");
+ print(statement);
+
+ rs = dbSendQuery(con, statement = statement);
+ d = fetch(rs, n=-1);
+ dbHasCompleted(rs);
+ dbClearResult(rs);
+ } else {
+ statement = paste("SELECT * FROM ILLUMINA_SAMPLE_STATUS_AGG WHERE \"Project\" = '", project, "'", sep="");
+ print(statement);
+
+ rs = dbSendQuery(con, statement = statement);
+ d = fetch(rs, n=-1);
+ dbHasCompleted(rs);
+ dbClearResult(rs);
+ }
+
+ oraCloseDriver(drv);
+
+ subset(d, Project == project);
+}
diff --git a/public/R/src/gsalib/R/gsa.read.vcf.R b/public/R/src/gsalib/R/gsa.read.vcf.R
new file mode 100644
index 000000000..5beb6455d
--- /dev/null
+++ b/public/R/src/gsalib/R/gsa.read.vcf.R
@@ -0,0 +1,23 @@
+gsa.read.vcf <- function(vcffile, skip=0, nrows=-1, expandGenotypeFields = FALSE) {
+ headers = readLines(vcffile, n=100);
+ headerline = headers[grep("#CHROM", headers)];
+ header = unlist(strsplit(gsub("#", "", headerline), "\t"))
+
+ d = read.table(vcffile, header=FALSE, skip=skip, nrows=nrows, stringsAsFactors=FALSE);
+ colnames(d) = header;
+
+ if (expandGenotypeFields) {
+ columns = ncol(d);
+
+ offset = columns + 1;
+ for (sampleIndex in 10:columns) {
+ gt = unlist(lapply(strsplit(d[,sampleIndex], ":"), function(x) x[1]));
+ d[,offset] = gt;
+ colnames(d)[offset] = sprintf("%s.GT", colnames(d)[sampleIndex]);
+
+ offset = offset + 1;
+ }
+ }
+
+ return(d);
+}
diff --git a/public/R/src/gsalib/R/gsa.warn.R b/public/R/src/gsalib/R/gsa.warn.R
new file mode 100644
index 000000000..7ee08ce65
--- /dev/null
+++ b/public/R/src/gsalib/R/gsa.warn.R
@@ -0,0 +1,3 @@
+gsa.warn <- function(message) {
+ gsa.message(sprintf("Warning: %s", message));
+}
diff --git a/public/R/src/gsalib/Read-and-delete-me b/public/R/src/gsalib/Read-and-delete-me
new file mode 100644
index 000000000..d04323a6e
--- /dev/null
+++ b/public/R/src/gsalib/Read-and-delete-me
@@ -0,0 +1,9 @@
+* Edit the help file skeletons in 'man', possibly combining help files
+ for multiple functions.
+* Put any C/C++/Fortran code in 'src'.
+* If you have compiled code, add a .First.lib() function in 'R' to load
+ the shared library.
+* Run R CMD build to build the package tarball.
+* Run R CMD check to check the package tarball.
+
+Read "Writing R Extensions" for more information.
diff --git a/public/R/src/gsalib/data/tearsheetdrop.jpg b/public/R/src/gsalib/data/tearsheetdrop.jpg
new file mode 100755
index 000000000..c9d480fa0
Binary files /dev/null and b/public/R/src/gsalib/data/tearsheetdrop.jpg differ
diff --git a/public/R/src/gsalib/man/gsa.error.Rd b/public/R/src/gsalib/man/gsa.error.Rd
new file mode 100644
index 000000000..df7c0cbde
--- /dev/null
+++ b/public/R/src/gsalib/man/gsa.error.Rd
@@ -0,0 +1,49 @@
+\name{gsa.error}
+\alias{gsa.error}
+\title{
+GSA error
+}
+\description{
+Write an error message to standard out with the prefix '[gsalib] Error:', print a traceback, and exit.
+}
+\usage{
+gsa.error(message)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{message}{
+The error message to write.
+}
+}
+\details{
+%% ~~ If necessary, more details than the description above ~~
+}
+\value{
+%% ~Describe the value returned
+%% If it is a LIST, use
+%% \item{comp1 }{Description of 'comp1'}
+%% \item{comp2 }{Description of 'comp2'}
+%% ...
+}
+\references{
+%% ~put references to the literature/web site here ~
+}
+\author{
+Kiran Garimella
+}
+\note{
+%% ~~further notes~~
+}
+
+%% ~Make other sections like Warning with \section{Warning }{....} ~
+
+\seealso{
+%% ~~objects to See Also as \code{\link{help}}, ~~~
+}
+\examples{
+gsa.error("This is a message");
+}
+% Add one or more standard keywords, see file 'KEYWORDS' in the
+% R documentation directory.
+\keyword{ ~kwd1 }
+\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
diff --git a/public/R/src/gsalib/man/gsa.getargs.Rd b/public/R/src/gsalib/man/gsa.getargs.Rd
new file mode 100644
index 000000000..27aa1b05a
--- /dev/null
+++ b/public/R/src/gsalib/man/gsa.getargs.Rd
@@ -0,0 +1,57 @@
+\name{gsa.getargs}
+\alias{gsa.getargs}
+\title{
+Get script arguments
+}
+\description{
+Get script arguments given a list object specifying arguments and documentation. Can be used in command-line or interactive mode. This is helpful when developing scripts in interactive mode that will eventually become command-line programs. If no arguments are specified or help is requested in command-line mode, the script will print out a usage statement with available arguments and exit.
+}
+\usage{
+gsa.getargs(argspec, doc = NA)
+}
+\arguments{
+ \item{argspec}{
+A list object. Each key is an argument name. The value is another list object with a 'value' and 'doc' keys. For example:
+\preformatted{argspec = list(
+ arg1 = list(value=10, doc="Info for optional arg1"),
+ arg2 = list(value=NA, doc="Info for required arg2")
+);
+}
+
+If the value provided is NA, the argument is considered required and must be specified when the script is invoked. For command-line mode, this means the argument must be specified on the command-line. In interactive mode, there are two ways of specifying these arguments. First, if a properly formatted list argument called 'cmdargs' is present in the current environment (i.e. the object returned by gsa.getargs() from a previous invocation), the value is taken from this object. Otherwise, the argument is prompted for.
+}
+
+ \item{doc}{
+An optional string succinctly documenting the purpose of the script.
+}
+}
+\details{
+Interactive scripts typically make use of hardcoded filepaths and parameter settings. This makes testing easy, but generalization to non-interactive mode more difficult. This utility provides a mechanism for writing scripts that work properly in both interactive and command-line modes.
+
+To use this method, specify a list with key-value pairs representing the arguments as specified above. In command-line mode, if no arguments are specified or the user specifies '-h' or '-help' anywhere on the command string, a help message indicating available arguments, their default values, and some documentation about the argument are provided.
+}
+\value{
+Returns a list with keys matching the argspec and values representing the specified arguments.
+
+\item{arg1 }{Value for argument 1}
+\item{arg2 }{Value for argument 2}
+...etc.
+}
+\references{
+%% ~put references to the literature/web site here ~
+}
+\author{
+Kiran Garimella
+}
+\examples{
+argspec = list(
+ file = list(value="/my/test.vcf", doc="VCF file"),
+ verbose = list(value=0, doc="If 1, set verbose mode"),
+ test2 = list(value=2.3e9, doc="Another argument that does stuff")
+);
+
+cmdargs = gsa.getargs(argspec, doc="My test program");
+
+print(cmdargs$file); # will print '[1] "/my/test.vcf"'
+}
+\keyword{ ~kwd1 }
diff --git a/public/R/src/gsalib/man/gsa.message.Rd b/public/R/src/gsalib/man/gsa.message.Rd
new file mode 100644
index 000000000..9752de8a9
--- /dev/null
+++ b/public/R/src/gsalib/man/gsa.message.Rd
@@ -0,0 +1,44 @@
+\name{gsa.message}
+\alias{gsa.message}
+\title{
+GSA message
+}
+\description{
+Write a message to standard out with the prefix '[gsalib]'.
+}
+\usage{
+gsa.message(message)
+}
+\arguments{
+ \item{message}{
+The message to write.
+}
+}
+\details{
+%% ~~ If necessary, more details than the description above ~~
+}
+\value{
+%% ~Describe the value returned
+%% If it is a LIST, use
+%% \item{comp1 }{Description of 'comp1'}
+%% \item{comp2 }{Description of 'comp2'}
+%% ...
+}
+\references{
+%% ~put references to the literature/web site here ~
+}
+\author{
+Kiran Garimella
+}
+\note{
+%% ~~further notes~~
+}
+
+\seealso{
+%% ~~objects to See Also as \code{\link{help}}, ~~~
+}
+\examples{
+## Write message to stdout
+gsa.message("This is a message");
+}
+\keyword{ ~kwd1 }
diff --git a/public/R/src/gsalib/man/gsa.plot.venn.Rd b/public/R/src/gsalib/man/gsa.plot.venn.Rd
new file mode 100644
index 000000000..bf4feb5bc
--- /dev/null
+++ b/public/R/src/gsalib/man/gsa.plot.venn.Rd
@@ -0,0 +1,75 @@
+\name{gsa.plot.venn}
+\alias{gsa.plot.venn}
+\title{
+Plot a proportional venn diagram
+}
+\description{
+Plot a proportional venn diagram (two or three-way venns allowed)
+}
+\usage{
+gsa.plot.venn(a, b, c = 0, a_and_b, a_and_c = 0, b_and_c = 0, col = c("#FF6342", "#63C6DE", "#ADDE63"), pos = c(0.2, 0.2, 0.8, 0.82), debug = 0)
+}
+\arguments{
+ \item{a}{
+size of 'a' circle
+}
+ \item{b}{
+size of 'b' circle
+}
+ \item{c}{
+size of 'c' circle
+}
+ \item{a_and_b}{
+size of a and b overlap
+}
+ \item{a_and_c}{
+size of a and c overlap
+}
+ \item{b_and_c}{
+size of b and c overlap
+}
+ \item{col}{
+vector of colors for each venn piece
+}
+ \item{pos}{
+vector of positional elements
+}
+ \item{debug}{
+if 1, set debug mode and print useful information
+}
+}
+\details{
+Plots a two-way or three-way proportional Venn diagram. Internally, this method uses the Google Chart API to generate the diagram, then renders it into the plot window where it can be annotated in interesting ways.
+}
+\value{
+%% ~Describe the value returned
+%% If it is a LIST, use
+%% \item{comp1 }{Description of 'comp1'}
+%% \item{comp2 }{Description of 'comp2'}
+%% ...
+}
+\references{
+}
+\author{
+Kiran Garimella
+}
+\note{
+%% ~~further notes~~
+}
+
+%% ~Make other sections like Warning with \section{Warning }{....} ~
+
+\seealso{
+%% ~~objects to See Also as \code{\link{help}}, ~~~
+}
+\examples{
+## Plot a two-way Venn diagram
+gsa.plot.venn(1000, 750, 0, 400);
+
+## Plot a three-way Venn diagram
+gsa.plot.venn(1000, 750, 900, 400, 650, 500);
+}
+% Add one or more standard keywords, see file 'KEYWORDS' in the
+% R documentation directory.
+\keyword{ ~kwd1 }
+\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
diff --git a/public/R/src/gsalib/man/gsa.read.eval.Rd b/public/R/src/gsalib/man/gsa.read.eval.Rd
new file mode 100644
index 000000000..0e2baba73
--- /dev/null
+++ b/public/R/src/gsalib/man/gsa.read.eval.Rd
@@ -0,0 +1,111 @@
+\name{gsa.read.eval}
+\alias{gsa.read.eval}
+\title{
+Read a VariantEval file
+}
+\description{
+Read a VariantEval file that's output in R format.
+}
+\usage{
+gsa.read.eval(evalRoot)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{evalRoot}{
+%% ~~Describe \code{evalRoot} here~~
+}
+}
+\details{
+%% ~~ If necessary, more details than the description above ~~
+}
+\value{
+%% ~Describe the value returned
+%% If it is a LIST, use
+%% \item{comp1 }{Description of 'comp1'}
+%% \item{comp2 }{Description of 'comp2'}
+%% ...
+}
+\references{
+%% ~put references to the literature/web site here ~
+}
+\author{
+%% ~~who you are~~
+}
+\note{
+%% ~~further notes~~
+}
+
+%% ~Make other sections like Warning with \section{Warning }{....} ~
+
+\seealso{
+%% ~~objects to See Also as \code{\link{help}}, ~~~
+}
+\examples{
+##---- Should be DIRECTLY executable !! ----
+##-- ==> Define data, use random,
+##-- or do help(data=index) for the standard data sets.
+
+## The function is currently defined as
+function(evalRoot) {
+ fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep="");
+ fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep="");
+ fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep="");
+ fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep="");
+ fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep="");
+ fileMetricsBySample = paste(evalRoot, ".MetricsBySample.csv", sep="");
+ fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep="");
+ fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep="");
+ fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep="");
+ fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep="");
+ fileSimpleMetricsBySample = paste(evalRoot, ".SimpleMetricsBySample.csv", sep="");
+ fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep="");
+ fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep="");
+ fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep="");
+
+ eval = list(
+ AlleleCountStats = NA,
+ CompOverlap = NA,
+ CountVariants = NA,
+ GenotypeConcordance = NA,
+ MetricsByAc = NA,
+ MetricsBySample = NA,
+ Quality_Metrics_by_allele_count = NA,
+ QualityScoreHistogram = NA,
+ SampleStatistics = NA,
+ SampleSummaryStatistics = NA,
+ SimpleMetricsBySample = NA,
+ TiTv = NA,
+ TiTvStats = NA,
+ Variant_Quality_Score = NA,
+
+ CallsetNames = c(),
+ CallsetOnlyNames = c(),
+ CallsetFilteredNames = c()
+ );
+
+ eval$AlleleCountStats = .attemptToLoadFile(fileAlleleCountStats);
+ eval$CompOverlap = .attemptToLoadFile(fileCompOverlap);
+ eval$CountVariants = .attemptToLoadFile(fileCountVariants);
+ eval$GenotypeConcordance = .attemptToLoadFile(fileGenotypeConcordance);
+ eval$MetricsByAc = .attemptToLoadFile(fileMetricsByAc);
+ eval$MetricsBySample = .attemptToLoadFile(fileMetricsBySample);
+ eval$Quality_Metrics_by_allele_count = .attemptToLoadFile(fileQuality_Metrics_by_allele_count);
+ eval$QualityScoreHistogram = .attemptToLoadFile(fileQualityScoreHistogram);
+ eval$SampleStatistics = .attemptToLoadFile(fileSampleStatistics);
+ eval$SampleSummaryStatistics = .attemptToLoadFile(fileSampleSummaryStatistics);
+ eval$SimpleMetricsBySample = .attemptToLoadFile(fileSimpleMetricsBySample);
+ eval$TiTv = .attemptToLoadFile(fileTi_slash_Tv_Variant_Evaluator);
+ eval$TiTvStats = .attemptToLoadFile(fileTiTvStats);
+ eval$Variant_Quality_Score = .attemptToLoadFile(fileVariant_Quality_Score);
+
+ uniqueJexlExpressions = unique(eval$TiTv$jexl_expression);
+ eval$CallsetOnlyNames = as.vector(uniqueJexlExpressions[grep("FilteredIn|Intersection|none", uniqueJexlExpressions, invert=TRUE, ignore.case=TRUE)]);
+ eval$CallsetNames = as.vector(gsub("-only", "", eval$CallsetOnlyNames));
+ eval$CallsetFilteredNames = as.vector(c());
+ eval;
+ }
+}
+% Add one or more standard keywords, see file 'KEYWORDS' in the
+% R documentation directory.
+\keyword{ ~kwd1 }
+\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
diff --git a/public/R/src/gsalib/man/gsa.read.gatkreport.Rd b/public/R/src/gsalib/man/gsa.read.gatkreport.Rd
new file mode 100644
index 000000000..67c2c7b28
--- /dev/null
+++ b/public/R/src/gsalib/man/gsa.read.gatkreport.Rd
@@ -0,0 +1,55 @@
+\name{gsa.read.gatkreport}
+\alias{gsa.read.gatkreport}
+\title{
+gsa.read.gatkreport
+}
+\description{
+Reads a GATKReport file - a multi-table document - and loads each table as a separate data.frame object in a list.
+}
+\usage{
+gsa.read.gatkreport(filename)
+}
+\arguments{
+ \item{filename}{
+The path to the GATKReport file.
+}
+}
+\details{
+The GATKReport format replaces the multi-file output format used by many GATK tools and provides a single, consolidated file format. This format accomodates multiple tables and is still R-loadable - through this function.
+
+The file format looks like this:
+\preformatted{##:GATKReport.v0.1 TableName : The description of the table
+col1 col2 col3
+0 0.007451835696110506 25.474613284804366
+1 0.002362777171937477 29.844949954504095
+2 9.087604507451836E-4 32.87590975254731
+3 5.452562704471102E-4 34.498999090081895
+4 9.087604507451836E-4 35.14831665150137
+}
+
+}
+\value{
+Returns a list object, where each key is the TableName and the value is the data.frame object with the contents of the table. If multiple tables with the same name exist, each one after the first will be given names of "TableName.v1", "TableName.v2", ..., "TableName.vN".
+%% ~Describe the value returned
+%% If it is a LIST, use
+%% \item{comp1 }{Description of 'comp1'}
+%% \item{comp2 }{Description of 'comp2'}
+%% ...
+}
+\references{
+%% ~put references to the literature/web site here ~
+}
+\author{
+Kiran Garimella
+}
+\note{
+%% ~~further notes~~
+}
+
+\seealso{
+%% ~~objects to See Also as \code{\link{help}}, ~~~
+}
+\examples{
+report = gsa.read.gatkreport("/path/to/my/output.gatkreport");
+}
+\keyword{ ~kwd1 }
diff --git a/public/R/src/gsalib/man/gsa.read.squidmetrics.Rd b/public/R/src/gsalib/man/gsa.read.squidmetrics.Rd
new file mode 100644
index 000000000..0a8b37843
--- /dev/null
+++ b/public/R/src/gsalib/man/gsa.read.squidmetrics.Rd
@@ -0,0 +1,48 @@
+\name{gsa.read.squidmetrics}
+\alias{gsa.read.squidmetrics}
+\title{
+gsa.read.squidmetrics
+}
+\description{
+Reads metrics for a specified SQUID project into a dataframe.
+}
+\usage{
+gsa.read.squidmetrics("C315")
+}
+\arguments{
+ \item{project}{
+The project for which metrics should be obtained.
+}
+ \item{bylane}{
+If TRUE, obtains per-lane metrics rather than the default per-sample metrics.
+}
+}
+\details{
+%% ~~ If necessary, more details than the description above ~~
+}
+\value{
+%% ~Describe the value returned
+%% If it is a LIST, use
+%% \item{comp1 }{Description of 'comp1'}
+%% \item{comp2 }{Description of 'comp2'}
+%% ...
+Returns a data frame with samples (or lanes) as the row and the metric as the column.
+}
+\references{
+%% ~put references to the literature/web site here ~
+}
+\author{
+Kiran Garimella
+}
+\note{
+This method will only work within the Broad Institute internal network.
+}
+
+\seealso{
+%% ~~objects to See Also as \code{\link{help}}, ~~~
+}
+\examples{
+## Obtain metrics for project C315.
+d = gsa.read.squidmetrics("C315");
+}
+\keyword{ ~kwd1 }
diff --git a/public/R/src/gsalib/man/gsa.read.vcf.Rd b/public/R/src/gsalib/man/gsa.read.vcf.Rd
new file mode 100644
index 000000000..cffd35e8f
--- /dev/null
+++ b/public/R/src/gsalib/man/gsa.read.vcf.Rd
@@ -0,0 +1,53 @@
+\name{gsa.read.vcf}
+\alias{gsa.read.vcf}
+\title{
+gsa.read.vcf
+}
+\description{
+Reads a VCF file into a table. Optionally expands genotype columns into separate columns containing the genotype, separate from the other fields specified in the FORMAT field.
+}
+\usage{
+gsa.read.vcf(vcffile, skip=0, nrows=-1, expandGenotypeFields = FALSE)
+}
+\arguments{
+ \item{vcffile}{
+The path to the vcf file.
+}
+ \item{skip}{
+The number of lines of the data file to skip before beginning to read data.
+}
+ \item{nrows}{
+The maximum number of rows to read in. Negative and other invalid values are ignored.
+}
+ \item{expandGenotypeFields}{
+If TRUE, adds an additional column per sample containing just the genotype.
+}
+}
+\details{
+The VCF format is the standard variant call file format used in the GATK. This function reads that data in as a table for easy analysis.
+}
+\value{
+Returns a data.frame object, where each column corresponds to the columns in the VCF file.
+%% ~Describe the value returned
+%% If it is a LIST, use
+%% \item{comp1 }{Description of 'comp1'}
+%% \item{comp2 }{Description of 'comp2'}
+%% ...
+}
+\references{
+%% ~put references to the literature/web site here ~
+}
+\author{
+Kiran Garimella
+}
+\note{
+%% ~~further notes~~
+}
+
+\seealso{
+%% ~~objects to See Also as \code{\link{help}}, ~~~
+}
+\examples{
+vcf = gsa.read.vcf("/path/to/my/output.vcf");
+}
+\keyword{ ~kwd1 }
diff --git a/public/R/src/gsalib/man/gsa.warn.Rd b/public/R/src/gsalib/man/gsa.warn.Rd
new file mode 100644
index 000000000..0b9770b5c
--- /dev/null
+++ b/public/R/src/gsalib/man/gsa.warn.Rd
@@ -0,0 +1,46 @@
+\name{gsa.warn}
+\alias{gsa.warn}
+\title{
+GSA warn
+}
+\description{
+Write a warning message to standard out with the prefix '[gsalib] Warning:'.
+}
+\usage{
+gsa.warn(message)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{message}{
+The warning message to write.
+}
+}
+\details{
+%% ~~ If necessary, more details than the description above ~~
+}
+\value{
+%% ~Describe the value returned
+%% If it is a LIST, use
+%% \item{comp1 }{Description of 'comp1'}
+%% \item{comp2 }{Description of 'comp2'}
+%% ...
+}
+\references{
+%% ~put references to the literature/web site here ~
+}
+\author{
+Kiran Garimella
+}
+\note{
+%% ~~further notes~~
+}
+
+\seealso{
+%% ~~objects to See Also as \code{\link{help}}, ~~~
+}
+\examples{
+## Write message to stdout
+gsa.warn("This is a warning message");
+}
+\keyword{ ~kwd1 }
+\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
diff --git a/public/R/src/gsalib/man/gsalib-package.Rd b/public/R/src/gsalib/man/gsalib-package.Rd
new file mode 100644
index 000000000..2b8d6db9f
--- /dev/null
+++ b/public/R/src/gsalib/man/gsalib-package.Rd
@@ -0,0 +1,68 @@
+\name{gsalib-package}
+\alias{gsalib-package}
+\alias{gsalib}
+\docType{package}
+\title{
+GATK utility analysis functions
+}
+\description{
+Utility functions for analyzing GATK-processed NGS data
+}
+\details{
+This package contains functions for working with GATK-processed NGS data. These functions include a command-line parser that also allows a script to be used in interactive mode (good for developing scripts that will eventually be automated), a proportional Venn diagram generator, convenience methods for parsing VariantEval output, and more.
+}
+\author{
+Genome Sequencing and Analysis Group
+
+Medical and Population Genetics Program
+
+Maintainer: Kiran Garimella
+}
+\references{
+GSA wiki page: http://www.broadinstitute.org/gsa/wiki
+
+GATK help forum: http://www.getsatisfaction.com/gsa
+}
+\examples{
+## get script arguments in interactive and non-interactive mode
+cmdargs = gsa.getargs( list(
+ requiredArg1 = list(
+ value = NA,
+ doc = "Documentation for requiredArg1"
+ ),
+
+ optionalArg1 = list(
+ value = 3e9,
+ doc = "Documentation for optionalArg1"
+ )
+) );
+
+## plot a proportional Venn diagram
+gsa.plot.venn(500, 250, 0, 100);
+
+## read a GATKReport file
+report = gsa.gatk.report("/path/to/my/output.gatkreport");
+
+## emit a message
+gsa.message("This is a message");
+
+## emit a warning message
+gsa.message("This is a warning message");
+
+## emit an error message
+gsa.message("This is an error message");
+
+## read the SQUID metrics for a given sequencing project (internal to the Broad only)
+s = gsa.read.squidmetrics("C427");
+
+## read command-line arguments
+cmdargs = gsa.getargs(
+ list(
+ file = list(value="/my/test.vcf", doc="VCF file"),
+ verbose = list(value=0, doc="If 1, set verbose mode"),
+ test2 = list(value=2.3e9, doc="Another argument that does stuff")
+ ),
+ doc="My test program"
+);
+}
+\keyword{ package }
diff --git a/public/c/SeparateQltout.cc b/public/c/SeparateQltout.cc
new file mode 100644
index 000000000..7644c9603
--- /dev/null
+++ b/public/c/SeparateQltout.cc
@@ -0,0 +1,70 @@
+#include "MainTools.h"
+#include "Basevector.h"
+#include "lookup/LookAlign.h"
+#include "lookup/SerialQltout.h"
+
+unsigned int MatchingEnd(look_align &la, vecbasevector &candidates, vecbasevector &ref) {
+ //la.PrintParseable(cout);
+
+ for (int i = 0; i < candidates.size(); i++) {
+ look_align newla = la;
+
+ if (newla.rc1) { candidates[i].ReverseComplement(); }
+ newla.ResetFromAlign(newla.a, candidates[i], ref[la.target_id]);
+
+ //newla.PrintParseable(cout, &candidates[i], &ref[newla.target_id]);
+ //cout << newla.Errors() << " " << la.Errors() << endl;
+
+ if (newla.Errors() == la.Errors()) {
+ return i;
+ }
+ }
+
+ //FatalErr("Query id " + ToString(la.query_id) + " had no matches.");
+
+ return candidates.size() + 1;
+}
+
+int main(int argc, char **argv) {
+ RunTime();
+
+ BeginCommandArguments;
+ CommandArgument_String(ALIGNS);
+ CommandArgument_String(FASTB_END_1);
+ CommandArgument_String(FASTB_END_2);
+ CommandArgument_String(REFERENCE);
+
+ CommandArgument_String(ALIGNS_END_1_OUT);
+ CommandArgument_String(ALIGNS_END_2_OUT);
+ EndCommandArguments;
+
+ vecbasevector ref(REFERENCE);
+ vecbasevector reads1(FASTB_END_1);
+ vecbasevector reads2(FASTB_END_2);
+
+ ofstream aligns1stream(ALIGNS_END_1_OUT.c_str());
+ ofstream aligns2stream(ALIGNS_END_2_OUT.c_str());
+
+ basevector bv;
+
+ SerialQltout sqltout(ALIGNS);
+ look_align la;
+ while (sqltout.Next(la)) {
+ vecbasevector candidates(2);
+ candidates[0] = reads1[la.query_id];
+ candidates[1] = reads2[la.query_id];
+
+ unsigned int matchingend = MatchingEnd(la, candidates, ref);
+ if (matchingend < 2) {
+ bv = (matchingend == 0) ? reads1[la.query_id] : reads2[la.query_id];
+
+ //la.PrintParseable(cout, &bv, &ref[la.target_id]);
+ la.PrintParseable(((matchingend == 0) ? aligns1stream : aligns2stream), &bv, &ref[la.target_id]);
+ }
+ }
+
+ aligns1stream.close();
+ aligns2stream.close();
+
+ return 0;
+}
diff --git a/public/c/bwa/Makefile b/public/c/bwa/Makefile
new file mode 100644
index 000000000..6399a0e6d
--- /dev/null
+++ b/public/c/bwa/Makefile
@@ -0,0 +1,21 @@
+CXX=g++
+CXXFLAGS=-g -Wall -O2 -m64 -fPIC
+
+.cpp.o:
+ $(CXX) -c $(CXXFLAGS) -I$(BWA_HOME) -I$(JAVA_INCLUDE) $< -o $@
+
+all: init lib
+
+init:
+ @echo Please make sure the following platforms are set correctly on your machine.
+ @echo BWA_HOME=$(BWA_HOME)
+ @echo JAVA_INCLUDE=$(JAVA_INCLUDE)
+ @echo TARGET_LIB=$(TARGET_LIB)
+ @echo EXTRA_LIBS=$(EXTRA_LIBS)
+ @echo LIBTOOL_COMMAND=$(LIBTOOL_COMMAND)
+
+lib: org_broadinstitute_sting_alignment_bwa_c_BWACAligner.o bwa_gateway.o
+ $(LIBTOOL_COMMAND) $? -o $(TARGET_LIB) -L$(BWA_HOME) -lbwacore $(EXTRA_LIBS)
+
+clean:
+ rm *.o libbwa.*
diff --git a/public/c/bwa/build_linux.sh b/public/c/bwa/build_linux.sh
new file mode 100755
index 000000000..b3631a28d
--- /dev/null
+++ b/public/c/bwa/build_linux.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+export BWA_HOME="/humgen/gsa-scr1/hanna/src/bwa-trunk/bwa"
+export JAVA_INCLUDE="/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include -I/broad/tools/Linux/x86_64/pkgs/jdk_1.6.0_12/include/linux"
+export TARGET_LIB="libbwa.so"
+export EXTRA_LIBS="-lc -lz -lstdc++ -lpthread"
+export LIBTOOL_COMMAND="g++ -shared -Wl,-soname,libbwa.so"
+make
diff --git a/public/c/bwa/build_mac.sh b/public/c/bwa/build_mac.sh
new file mode 100644
index 000000000..bfed900bb
--- /dev/null
+++ b/public/c/bwa/build_mac.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+export BWA_HOME="/Users/mhanna/src/bwa"
+export JAVA_INCLUDE="/System/Library/Frameworks/JavaVM.framework/Headers"
+export TARGET_LIB="libbwa.dylib"
+export EXTRA_LIBS="-lc -lz -lsupc++"
+export LIBTOOL_COMMAND="libtool -dynamic"
+make
diff --git a/public/c/bwa/bwa_gateway.cpp b/public/c/bwa/bwa_gateway.cpp
new file mode 100644
index 000000000..00f5aa5bc
--- /dev/null
+++ b/public/c/bwa/bwa_gateway.cpp
@@ -0,0 +1,277 @@
+#include
+#include
+#include
+
+#include "bwase.h"
+#include "bwa_gateway.h"
+
+BWA::BWA(const char* ann_filename,
+ const char* amb_filename,
+ const char* pac_filename,
+ const char* forward_bwt_filename,
+ const char* forward_sa_filename,
+ const char* reverse_bwt_filename,
+ const char* reverse_sa_filename)
+{
+ // Load the bns (?) and reference
+ bns = bns_restore_core(ann_filename,amb_filename,pac_filename);
+ reference = new ubyte_t[bns->l_pac/4+1];
+ rewind(bns->fp_pac);
+ fread(reference, 1, bns->l_pac/4+1, bns->fp_pac);
+ fclose(bns->fp_pac);
+ bns->fp_pac = NULL;
+
+ // Load the BWTs (both directions) and suffix arrays (both directions)
+ bwts[0] = bwt_restore_bwt(forward_bwt_filename);
+ bwt_restore_sa(forward_sa_filename, bwts[0]);
+ bwts[1] = bwt_restore_bwt(reverse_bwt_filename);
+ bwt_restore_sa(reverse_sa_filename, bwts[1]);
+ load_default_options();
+
+ // Always reinitialize the random seed whenever a new set of files are loaded.
+ initialize_random_seed();
+
+ // initialize the bwase subsystem
+ bwase_initialize();
+}
+
+BWA::~BWA() {
+ delete[] reference;
+ bns_destroy(bns);
+ bwt_destroy(bwts[0]);
+ bwt_destroy(bwts[1]);
+}
+
+void BWA::find_paths(const char* bases, const unsigned read_length, bwt_aln1_t*& paths, unsigned& num_paths, unsigned& best_path_count, unsigned& second_best_path_count)
+{
+ bwa_seq_t* sequence = create_sequence(bases, read_length);
+
+ // Calculate the suffix array interval for each sequence, storing the result in sequence->aln (and sequence->n_aln).
+ // This method will destroy the contents of seq and rseq.
+ bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options);
+
+ paths = new bwt_aln1_t[sequence->n_aln];
+ memcpy(paths,sequence->aln,sequence->n_aln*sizeof(bwt_aln1_t));
+ num_paths = sequence->n_aln;
+
+ // Call aln2seq to initialize the type of match present.
+ bwa_aln2seq(sequence->n_aln,sequence->aln,sequence);
+ best_path_count = sequence->c1;
+ second_best_path_count = sequence->c2;
+
+ bwa_free_read_seq(1,sequence);
+}
+
+Alignment* BWA::generate_single_alignment(const char* bases, const unsigned read_length) {
+ bwa_seq_t* sequence = create_sequence(bases,read_length);
+
+ // Calculate paths.
+ bwa_cal_sa_reg_gap(0,bwts,1,sequence,&options);
+
+ // Check for no alignments found and return null.
+ if(sequence->n_aln == 0) {
+ bwa_free_read_seq(1,sequence);
+ return NULL;
+ }
+
+ // bwa_cal_sa_reg_gap destroys the bases / read length. Copy them back in.
+ copy_bases_into_sequence(sequence,bases,read_length);
+
+ // Pick best alignment and propagate its information into the sequence.
+ bwa_aln2seq(sequence->n_aln,sequence->aln,sequence);
+
+ // Generate the best alignment from the sequence.
+ Alignment* alignment = new Alignment;
+ *alignment = generate_final_alignment_from_sequence(sequence);
+
+ bwa_free_read_seq(1,sequence);
+
+ return alignment;
+}
+
+void BWA::generate_alignments_from_paths(const char* bases,
+ const unsigned read_length,
+ bwt_aln1_t* paths,
+ const unsigned num_paths,
+ const unsigned best_count,
+ const unsigned second_best_count,
+ Alignment*& alignments,
+ unsigned& num_alignments)
+{
+ bwa_seq_t* sequence = create_sequence(bases,read_length);
+
+ sequence->aln = paths;
+ sequence->n_aln = num_paths;
+
+ // (Ab)use bwa_aln2seq to propagate values stored in the path out into the sequence itself.
+ bwa_aln2seq(sequence->n_aln,sequence->aln,sequence);
+
+ // But overwrite key parts of the sequence in case the user passed back only a smaller subset
+ // of the paths.
+ sequence->c1 = best_count;
+ sequence->c2 = second_best_count;
+ sequence->type = sequence->c1 > 1 ? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
+
+ num_alignments = 0;
+ for(unsigned i = 0; i < (unsigned)sequence->n_aln; i++)
+ num_alignments += (sequence->aln + i)->l - (sequence->aln + i)->k + 1;
+
+ alignments = new Alignment[num_alignments];
+ unsigned alignment_idx = 0;
+
+ for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) {
+ // Stub in a 'working' path, so that only the desired alignment is local-aligned.
+ const bwt_aln1_t* path = paths + path_idx;
+ bwt_aln1_t working_path = *path;
+
+ // Loop through all alignments, aligning each one individually.
+ for(unsigned sa_idx = path->k; sa_idx <= path->l; sa_idx++) {
+ working_path.k = working_path.l = sa_idx;
+ sequence->aln = &working_path;
+ sequence->n_aln = 1;
+
+ sequence->sa = sa_idx;
+ sequence->strand = path->a;
+ sequence->score = path->score;
+
+ // Each time through bwa_refine_gapped, seq gets reversed. Revert the reverse.
+ // TODO: Fix the interface to bwa_refine_gapped so its easier to work with.
+ if(alignment_idx > 0)
+ seq_reverse(sequence->len, sequence->seq, 0);
+
+ // Copy the local alignment data into the alignment object.
+ *(alignments + alignment_idx) = generate_final_alignment_from_sequence(sequence);
+
+ alignment_idx++;
+ }
+ }
+
+ sequence->aln = NULL;
+ sequence->n_aln = 0;
+
+ bwa_free_read_seq(1,sequence);
+}
+
+Alignment BWA::generate_final_alignment_from_sequence(bwa_seq_t* sequence) {
+ // Calculate the local coordinate and local alignment.
+ bwa_cal_pac_pos_core(bwts[0],bwts[1],sequence,options.max_diff,options.fnr);
+ bwa_refine_gapped(bns, 1, sequence, reference, NULL);
+
+ // Copy the local alignment data into the alignment object.
+ Alignment alignment;
+
+ // Populate basic path info
+ alignment.edit_distance = sequence->nm;
+ alignment.num_mismatches = sequence->n_mm;
+ alignment.num_gap_opens = sequence->n_gapo;
+ alignment.num_gap_extensions = sequence->n_gape;
+ alignment.num_best = sequence->c1;
+ alignment.num_second_best = sequence->c2;
+
+ // Final alignment position.
+ alignment.type = sequence->type;
+ bns_coor_pac2real(bns, sequence->pos, pos_end(sequence) - sequence->pos, &alignment.contig);
+ alignment.pos = sequence->pos - bns->anns[alignment.contig].offset + 1;
+ alignment.negative_strand = sequence->strand;
+ alignment.mapping_quality = sequence->mapQ;
+
+ // Cigar step.
+ alignment.cigar = NULL;
+ if(sequence->cigar) {
+ alignment.cigar = new uint16_t[sequence->n_cigar];
+ memcpy(alignment.cigar,sequence->cigar,sequence->n_cigar*sizeof(uint16_t));
+ }
+ alignment.n_cigar = sequence->n_cigar;
+
+ // MD tag with a better breakdown of differences in the cigar
+ alignment.md = strdup(sequence->md);
+ delete[] sequence->md;
+ sequence->md = NULL;
+
+ return alignment;
+}
+
+void BWA::load_default_options()
+{
+ options.s_mm = 3;
+ options.s_gapo = 11;
+ options.s_gape = 4;
+ options.mode = 3;
+ options.indel_end_skip = 5;
+ options.max_del_occ = 10;
+ options.max_entries = 2000000;
+ options.fnr = 0.04;
+ options.max_diff = -1;
+ options.max_gapo = 1;
+ options.max_gape = 6;
+ options.max_seed_diff = 2;
+ options.seed_len = 2147483647;
+ options.n_threads = 1;
+ options.max_top2 = 30;
+ options.trim_qual = 0;
+}
+
+void BWA::initialize_random_seed()
+{
+ srand48(bns->seed);
+}
+
+void BWA::set_max_edit_distance(float edit_distance) {
+ if(edit_distance > 0 && edit_distance < 1) {
+ options.fnr = edit_distance;
+ options.max_diff = -1;
+ }
+ else {
+ options.fnr = -1.0;
+ options.max_diff = (int)edit_distance;
+ }
+}
+
+void BWA::set_max_gap_opens(int max_gap_opens) { options.max_gapo = max_gap_opens; }
+void BWA::set_max_gap_extensions(int max_gap_extensions) { options.max_gape = max_gap_extensions; }
+void BWA::set_disallow_indel_within_range(int indel_range) { options.indel_end_skip = indel_range; }
+void BWA::set_mismatch_penalty(int penalty) { options.s_mm = penalty; }
+void BWA::set_gap_open_penalty(int penalty) { options.s_gapo = penalty; }
+void BWA::set_gap_extension_penalty(int penalty) { options.s_gape = penalty; }
+
+/**
+ * Create a sequence with a set of reasonable initial defaults.
+ * Will leave seq and rseq empty.
+ */
+bwa_seq_t* BWA::create_sequence(const char* bases, const unsigned read_length)
+{
+ bwa_seq_t* sequence = new bwa_seq_t;
+
+ sequence->tid = -1;
+
+ sequence->name = 0;
+
+ copy_bases_into_sequence(sequence, bases, read_length);
+
+ sequence->qual = 0;
+ sequence->aln = 0;
+ sequence->md = 0;
+
+ sequence->cigar = NULL;
+ sequence->n_cigar = 0;
+
+ sequence->multi = NULL;
+ sequence->n_multi = 0;
+
+ return sequence;
+}
+
+void BWA::copy_bases_into_sequence(bwa_seq_t* sequence, const char* bases, const unsigned read_length)
+{
+ // seq, rseq will ultimately be freed by bwa_cal_sa_reg_gap
+ sequence->seq = new ubyte_t[read_length];
+ sequence->rseq = new ubyte_t[read_length];
+ for(unsigned i = 0; i < read_length; i++) sequence->seq[i] = nst_nt4_table[(unsigned)bases[i]];
+ memcpy(sequence->rseq,sequence->seq,read_length);
+
+ // BWA expects the read bases to arrive reversed.
+ seq_reverse(read_length,sequence->seq,0);
+ seq_reverse(read_length,sequence->rseq,1);
+
+ sequence->full_len = sequence->len = read_length;
+}
diff --git a/public/c/bwa/bwa_gateway.h b/public/c/bwa/bwa_gateway.h
new file mode 100644
index 000000000..2d26ec650
--- /dev/null
+++ b/public/c/bwa/bwa_gateway.h
@@ -0,0 +1,83 @@
+#ifndef BWA_GATEWAY
+#define BWA_GATEWAY
+
+#include
+
+#include "bntseq.h"
+#include "bwt.h"
+#include "bwtaln.h"
+
+class Alignment {
+ public:
+ uint32_t type;
+ int contig;
+ bwtint_t pos;
+ bool negative_strand;
+ uint32_t mapping_quality;
+
+ uint16_t *cigar;
+ int n_cigar;
+
+ uint8_t num_mismatches;
+ uint8_t num_gap_opens;
+ uint8_t num_gap_extensions;
+ uint16_t edit_distance;
+
+ uint32_t num_best;
+ uint32_t num_second_best;
+
+ char* md;
+};
+
+class BWA {
+ private:
+ bntseq_t *bns;
+ ubyte_t* reference;
+ bwt_t* bwts[2];
+ gap_opt_t options;
+
+ void load_default_options();
+ void initialize_random_seed();
+ bwa_seq_t* create_sequence(const char* bases, const unsigned read_length);
+ void copy_bases_into_sequence(bwa_seq_t* sequence, const char* bases, const unsigned read_length);
+ Alignment generate_final_alignment_from_sequence(bwa_seq_t* sequence);
+
+ public:
+ BWA(const char* ann_filename,
+ const char* amb_filename,
+ const char* pac_filename,
+ const char* forward_bwt_filename,
+ const char* forward_sa_filename,
+ const char* reverse_bwt_filename,
+ const char* reverse_sa_filename);
+ ~BWA();
+
+ // Parameterize the aligner.
+ void set_max_edit_distance(float edit_distance);
+ void set_max_gap_opens(int max_gap_opens);
+ void set_max_gap_extensions(int max_gap_extensions);
+ void set_disallow_indel_within_range(int indel_range);
+ void set_mismatch_penalty(int penalty);
+ void set_gap_open_penalty(int penalty);
+ void set_gap_extension_penalty(int penalty);
+
+ // Perform the alignment
+ Alignment* generate_single_alignment(const char* bases,
+ const unsigned read_length);
+ void find_paths(const char* bases,
+ const unsigned read_length,
+ bwt_aln1_t*& paths,
+ unsigned& num_paths,
+ unsigned& best_path_count,
+ unsigned& second_best_path_count);
+ void generate_alignments_from_paths(const char* bases,
+ const unsigned read_length,
+ bwt_aln1_t* paths,
+ const unsigned num_paths,
+ const unsigned best_count,
+ const unsigned second_best_count,
+ Alignment*& alignments,
+ unsigned& num_alignments);
+};
+
+#endif // BWA_GATEWAY
diff --git a/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp b/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp
new file mode 100644
index 000000000..1ccbef0d4
--- /dev/null
+++ b/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.cpp
@@ -0,0 +1,437 @@
+#include
+#include
+#include
+
+#include "bntseq.h"
+#include "bwt.h"
+#include "bwtaln.h"
+#include "bwa_gateway.h"
+#include "org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h"
+
+typedef void (BWA::*int_setter)(int value);
+typedef void (BWA::*float_setter)(float value);
+
+static jobject convert_to_java_alignment(JNIEnv* env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment);
+static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name);
+static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter);
+static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, float_setter setter);
+static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message);
+
+JNIEXPORT jlong JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_create(JNIEnv* env, jobject instance, jobject bwtFiles, jobject configuration)
+{
+ jstring java_ann = get_configuration_file(env,bwtFiles,"annFile");
+ if(java_ann == NULL) return 0L;
+ jstring java_amb = get_configuration_file(env,bwtFiles,"ambFile");
+ if(java_amb == NULL) return 0L;
+ jstring java_pac = get_configuration_file(env,bwtFiles,"pacFile");
+ if(java_pac == NULL) return 0L;
+ jstring java_forward_bwt = get_configuration_file(env,bwtFiles,"forwardBWTFile");
+ if(java_forward_bwt == NULL) return 0L;
+ jstring java_forward_sa = get_configuration_file(env,bwtFiles,"forwardSAFile");
+ if(java_forward_sa == NULL) return 0L;
+ jstring java_reverse_bwt = get_configuration_file(env,bwtFiles,"reverseBWTFile");
+ if(java_reverse_bwt == NULL) return 0L;
+ jstring java_reverse_sa = get_configuration_file(env,bwtFiles,"reverseSAFile");
+ if(java_reverse_sa == NULL) return 0L;
+
+ const char* ann_filename = env->GetStringUTFChars(java_ann,JNI_FALSE);
+ if(env->ExceptionCheck()) return 0L;
+ const char* amb_filename = env->GetStringUTFChars(java_amb,JNI_FALSE);
+ if(env->ExceptionCheck()) return 0L;
+ const char* pac_filename = env->GetStringUTFChars(java_pac,JNI_FALSE);
+ if(env->ExceptionCheck()) return 0L;
+ const char* forward_bwt_filename = env->GetStringUTFChars(java_forward_bwt,JNI_FALSE);
+ if(env->ExceptionCheck()) return 0L;
+ const char* forward_sa_filename = env->GetStringUTFChars(java_forward_sa,JNI_FALSE);
+ if(env->ExceptionCheck()) return 0L;
+ const char* reverse_bwt_filename = env->GetStringUTFChars(java_reverse_bwt,JNI_FALSE);
+ if(env->ExceptionCheck()) return 0L;
+ const char* reverse_sa_filename = env->GetStringUTFChars(java_reverse_sa,JNI_FALSE);
+ if(env->ExceptionCheck()) return 0L;
+
+ BWA* bwa = new BWA(ann_filename,
+ amb_filename,
+ pac_filename,
+ forward_bwt_filename,
+ forward_sa_filename,
+ reverse_bwt_filename,
+ reverse_sa_filename);
+
+ Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration(env,instance,(jlong)bwa,configuration);
+ if(env->ExceptionCheck()) return 0L;
+
+ env->ReleaseStringUTFChars(java_ann,ann_filename);
+ if(env->ExceptionCheck()) return 0L;
+ env->ReleaseStringUTFChars(java_amb,amb_filename);
+ if(env->ExceptionCheck()) return 0L;
+ env->ReleaseStringUTFChars(java_pac,pac_filename);
+ if(env->ExceptionCheck()) return 0L;
+ env->ReleaseStringUTFChars(java_forward_bwt,forward_bwt_filename);
+ if(env->ExceptionCheck()) return 0L;
+ env->ReleaseStringUTFChars(java_forward_sa,forward_sa_filename);
+ if(env->ExceptionCheck()) return 0L;
+ env->ReleaseStringUTFChars(java_reverse_bwt,reverse_bwt_filename);
+ if(env->ExceptionCheck()) return 0L;
+ env->ReleaseStringUTFChars(java_reverse_sa,reverse_sa_filename);
+ if(env->ExceptionCheck()) return 0L;
+
+ return (jlong)bwa;
+}
+
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_destroy(JNIEnv* env, jobject instance, jlong java_bwa)
+{
+ BWA* bwa = (BWA*)java_bwa;
+ delete bwa;
+}
+
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration(JNIEnv *env, jobject instance, jlong java_bwa, jobject configuration) {
+ BWA* bwa = (BWA*)java_bwa;
+ set_float_configuration_param(env, configuration, "maximumEditDistance", bwa, &BWA::set_max_edit_distance);
+ if(env->ExceptionCheck()) return;
+ set_int_configuration_param(env, configuration, "maximumGapOpens", bwa, &BWA::set_max_gap_opens);
+ if(env->ExceptionCheck()) return;
+ set_int_configuration_param(env, configuration, "maximumGapExtensions", bwa, &BWA::set_max_gap_extensions);
+ if(env->ExceptionCheck()) return;
+ set_int_configuration_param(env, configuration, "disallowIndelWithinRange", bwa, &BWA::set_disallow_indel_within_range);
+ if(env->ExceptionCheck()) return;
+ set_int_configuration_param(env, configuration, "mismatchPenalty", bwa, &BWA::set_mismatch_penalty);
+ if(env->ExceptionCheck()) return;
+ set_int_configuration_param(env, configuration, "gapOpenPenalty", bwa, &BWA::set_gap_open_penalty);
+ if(env->ExceptionCheck()) return;
+ set_int_configuration_param(env, configuration, "gapExtensionPenalty", bwa, &BWA::set_gap_extension_penalty);
+ if(env->ExceptionCheck()) return;
+}
+
+JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases)
+{
+ BWA* bwa = (BWA*)java_bwa;
+
+ const jsize read_length = env->GetArrayLength(java_bases);
+ if(env->ExceptionCheck()) return NULL;
+
+ jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE);
+ if(read_bases == NULL) return NULL;
+
+ bwt_aln1_t* paths = NULL;
+ unsigned num_paths = 0;
+
+ unsigned best_path_count, second_best_path_count;
+ bwa->find_paths((const char*)read_bases,read_length,paths,num_paths,best_path_count,second_best_path_count);
+
+ jobjectArray java_paths = env->NewObjectArray(num_paths, env->FindClass("org/broadinstitute/sting/alignment/bwa/c/BWAPath"), NULL);
+ if(java_paths == NULL) return NULL;
+
+ for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) {
+ bwt_aln1_t& path = *(paths + path_idx);
+
+ jclass java_path_class = env->FindClass("org/broadinstitute/sting/alignment/bwa/c/BWAPath");
+ if(java_path_class == NULL) return NULL;
+
+ jmethodID java_path_constructor = env->GetMethodID(java_path_class, "", "(IIIZJJIII)V");
+ if(java_path_constructor == NULL) return NULL;
+
+ // Note that k/l are being cast to long. Bad things will happen if JNI assumes that they're ints.
+ jobject java_path = env->NewObject(java_path_class,
+ java_path_constructor,
+ path.n_mm,
+ path.n_gapo,
+ path.n_gape,
+ path.a,
+ (jlong)path.k,
+ (jlong)path.l,
+ path.score,
+ best_path_count,
+ second_best_path_count);
+ if(java_path == NULL) return NULL;
+
+ env->SetObjectArrayElement(java_paths,path_idx,java_path);
+ if(env->ExceptionCheck()) return NULL;
+
+ env->DeleteLocalRef(java_path_class);
+ if(env->ExceptionCheck()) return NULL;
+ }
+
+ delete[] paths;
+
+ env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE);
+
+ return env->ExceptionCheck() ? NULL : java_paths;
+}
+
+JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_convertPathsToAlignments(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases, jobjectArray java_paths)
+{
+ BWA* bwa = (BWA*)java_bwa;
+
+ const jsize read_length = env->GetArrayLength(java_bases);
+ if(env->ExceptionCheck()) return NULL;
+
+ jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE);
+ if(read_bases == NULL) return NULL;
+
+ const jsize num_paths = env->GetArrayLength(java_paths);
+ bwt_aln1_t* paths = new bwt_aln1_t[num_paths];
+ unsigned best_count = 0, second_best_count = 0;
+
+ for(unsigned path_idx = 0; path_idx < (unsigned)num_paths; path_idx++) {
+ jobject java_path = env->GetObjectArrayElement(java_paths,path_idx);
+ jclass java_path_class = env->GetObjectClass(java_path);
+ if(java_path_class == NULL) return NULL;
+
+ bwt_aln1_t& path = *(paths + path_idx);
+
+ jfieldID mismatches_field = env->GetFieldID(java_path_class, "numMismatches", "I");
+ if(mismatches_field == NULL) return NULL;
+ path.n_mm = env->GetIntField(java_path,mismatches_field);
+ if(env->ExceptionCheck()) return NULL;
+
+ jfieldID gap_opens_field = env->GetFieldID(java_path_class, "numGapOpens", "I");
+ if(gap_opens_field == NULL) return NULL;
+ path.n_gapo = env->GetIntField(java_path,gap_opens_field);
+ if(env->ExceptionCheck()) return NULL;
+
+ jfieldID gap_extensions_field = env->GetFieldID(java_path_class, "numGapExtensions", "I");
+ if(gap_extensions_field == NULL) return NULL;
+ path.n_gape = env->GetIntField(java_path,gap_extensions_field);
+ if(env->ExceptionCheck()) return NULL;
+
+ jfieldID negative_strand_field = env->GetFieldID(java_path_class, "negativeStrand", "Z");
+ if(negative_strand_field == NULL) return NULL;
+ path.a = env->GetBooleanField(java_path,negative_strand_field);
+ if(env->ExceptionCheck()) return NULL;
+
+ jfieldID k_field = env->GetFieldID(java_path_class, "k", "J");
+ if(k_field == NULL) return NULL;
+ path.k = env->GetLongField(java_path,k_field);
+ if(env->ExceptionCheck()) return NULL;
+
+ jfieldID l_field = env->GetFieldID(java_path_class, "l", "J");
+ if(l_field == NULL) return NULL;
+ path.l = env->GetLongField(java_path,l_field);
+ if(env->ExceptionCheck()) return NULL;
+
+ jfieldID score_field = env->GetFieldID(java_path_class, "score", "I");
+ if(score_field == NULL) return NULL;
+ path.score = env->GetIntField(java_path,score_field);
+ if(env->ExceptionCheck()) return NULL;
+
+ jfieldID best_count_field = env->GetFieldID(java_path_class, "bestCount", "I");
+ if(best_count_field == NULL) return NULL;
+ best_count = env->GetIntField(java_path,best_count_field);
+ if(env->ExceptionCheck()) return NULL;
+
+ jfieldID second_best_count_field = env->GetFieldID(java_path_class, "secondBestCount", "I");
+ if(second_best_count_field == NULL) return NULL;
+ second_best_count = env->GetIntField(java_path,second_best_count_field);
+ if(env->ExceptionCheck()) return NULL;
+ }
+
+ Alignment* alignments = NULL;
+ unsigned num_alignments = 0;
+ bwa->generate_alignments_from_paths((const char*)read_bases,read_length,paths,num_paths,best_count,second_best_count,alignments,num_alignments);
+
+ jobjectArray java_alignments = env->NewObjectArray(num_alignments, env->FindClass("org/broadinstitute/sting/alignment/Alignment"), NULL);
+ if(java_alignments == NULL) return NULL;
+
+ for(unsigned alignment_idx = 0; alignment_idx < (unsigned)num_alignments; alignment_idx++) {
+ Alignment& alignment = *(alignments + alignment_idx);
+ jobject java_alignment = convert_to_java_alignment(env,read_bases,read_length,alignment);
+ if(java_alignment == NULL) return NULL;
+ env->SetObjectArrayElement(java_alignments,alignment_idx,java_alignment);
+ if(env->ExceptionCheck()) return NULL;
+ }
+
+ delete[] alignments;
+ delete[] paths;
+
+ env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE);
+
+ return env->ExceptionCheck() ? NULL : java_alignments;
+}
+
+JNIEXPORT jobject JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getBestAlignment(JNIEnv *env, jobject instance, jlong java_bwa, jbyteArray java_bases) {
+ BWA* bwa = (BWA*)java_bwa;
+
+ const jsize read_length = env->GetArrayLength(java_bases);
+ if(env->ExceptionCheck()) return NULL;
+
+ jbyte *read_bases = env->GetByteArrayElements(java_bases,JNI_FALSE);
+ if(read_bases == NULL) return NULL;
+
+ Alignment* best_alignment = bwa->generate_single_alignment((const char*)read_bases,read_length);
+ jobject java_best_alignment = (best_alignment != NULL) ? convert_to_java_alignment(env,read_bases,read_length,*best_alignment) : NULL;
+ delete best_alignment;
+
+ env->ReleaseByteArrayElements(java_bases,read_bases,JNI_FALSE);
+
+ return java_best_alignment;
+}
+
+static jobject convert_to_java_alignment(JNIEnv *env, const jbyte* read_bases, const jsize read_length, const Alignment& alignment) {
+ unsigned cigar_length;
+ if(alignment.type == BWA_TYPE_NO_MATCH) cigar_length = 0;
+ else if(!alignment.cigar) cigar_length = 1;
+ else cigar_length = alignment.n_cigar;
+
+ jcharArray java_cigar_operators = env->NewCharArray(cigar_length);
+ if(java_cigar_operators == NULL) return NULL;
+ jintArray java_cigar_lengths = env->NewIntArray(cigar_length);
+ if(java_cigar_lengths == NULL) return NULL;
+
+ if(alignment.cigar) {
+ for(unsigned cigar_idx = 0; cigar_idx < (unsigned)alignment.n_cigar; ++cigar_idx) {
+ jchar cigar_operator = "MIDS"[alignment.cigar[cigar_idx]>>14];
+ jint cigar_length = alignment.cigar[cigar_idx]&0x3fff;
+
+ env->SetCharArrayRegion(java_cigar_operators,cigar_idx,1,&cigar_operator);
+ if(env->ExceptionCheck()) return NULL;
+ env->SetIntArrayRegion(java_cigar_lengths,cigar_idx,1,&cigar_length);
+ if(env->ExceptionCheck()) return NULL;
+ }
+ }
+ else {
+ if(alignment.type != BWA_TYPE_NO_MATCH) {
+ jchar cigar_operator = 'M';
+ env->SetCharArrayRegion(java_cigar_operators,0,1,&cigar_operator);
+ if(env->ExceptionCheck()) return NULL;
+ env->SetIntArrayRegion(java_cigar_lengths,0,1,&read_length);
+ if(env->ExceptionCheck()) return NULL;
+ }
+ }
+ delete[] alignment.cigar;
+
+ jclass java_alignment_class = env->FindClass("org/broadinstitute/sting/alignment/Alignment");
+ if(java_alignment_class == NULL) return NULL;
+
+ jmethodID java_alignment_constructor = env->GetMethodID(java_alignment_class, "", "(IIZI[C[IILjava/lang/String;IIIII)V");
+ if(java_alignment_constructor == NULL) return NULL;
+
+ jstring java_md = env->NewStringUTF(alignment.md);
+ if(java_md == NULL) return NULL;
+ delete[] alignment.md;
+
+ jobject java_alignment = env->NewObject(java_alignment_class,
+ java_alignment_constructor,
+ alignment.contig,
+ alignment.pos,
+ alignment.negative_strand,
+ alignment.mapping_quality,
+ java_cigar_operators,
+ java_cigar_lengths,
+ alignment.edit_distance,
+ java_md,
+ alignment.num_mismatches,
+ alignment.num_gap_opens,
+ alignment.num_gap_extensions,
+ alignment.num_best,
+ alignment.num_second_best);
+ if(java_alignment == NULL) return NULL;
+
+ env->DeleteLocalRef(java_alignment_class);
+ if(env->ExceptionCheck()) return NULL;
+
+ return java_alignment;
+}
+
+static jstring get_configuration_file(JNIEnv* env, jobject configuration, const char* field_name) {
+ jclass configuration_class = env->GetObjectClass(configuration);
+ if(configuration_class == NULL) return NULL;
+
+ jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/io/File;");
+ if(configuration_field == NULL) return NULL;
+
+ jobject configuration_file = (jobject)env->GetObjectField(configuration,configuration_field);
+
+ jclass file_class = env->FindClass("java/io/File");
+ if(file_class == NULL) return NULL;
+
+ jmethodID path_extractor = env->GetMethodID(file_class,"getAbsolutePath", "()Ljava/lang/String;");
+ if(path_extractor == NULL) return NULL;
+
+ jstring path = (jstring)env->CallObjectMethod(configuration_file,path_extractor);
+ if(path == NULL) return NULL;
+
+ env->DeleteLocalRef(configuration_class);
+ env->DeleteLocalRef(file_class);
+ env->DeleteLocalRef(configuration_file);
+
+ return path;
+}
+
+static void set_int_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, int_setter setter) {
+ jclass configuration_class = env->GetObjectClass(configuration);
+ if(configuration_class == NULL) return;
+
+ jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/Integer;");
+ if(configuration_field == NULL) return;
+
+ jobject boxed_value = env->GetObjectField(configuration,configuration_field);
+ if(env->ExceptionCheck()) return;
+
+ if(boxed_value != NULL) {
+ jclass int_box_class = env->FindClass("java/lang/Integer");
+ if(int_box_class == NULL) return;
+
+ jmethodID int_extractor = env->GetMethodID(int_box_class,"intValue", "()I");
+ if(int_extractor == NULL) return;
+
+ jint value = env->CallIntMethod(boxed_value,int_extractor);
+ if(env->ExceptionCheck()) return;
+
+ if(value < 0)
+ {
+ throw_config_value_exception(env,field_name,"cannot be set to a negative value");
+ return;
+ }
+
+ (bwa->*setter)(value);
+
+ env->DeleteLocalRef(int_box_class);
+ }
+
+ env->DeleteLocalRef(boxed_value);
+ env->DeleteLocalRef(configuration_class);
+}
+
+static void set_float_configuration_param(JNIEnv* env, jobject configuration, const char* field_name, BWA* bwa, float_setter setter)
+{
+ jclass configuration_class = env->GetObjectClass(configuration);
+ if(configuration_class == NULL) return;
+
+ jfieldID configuration_field = env->GetFieldID(configuration_class, field_name, "Ljava/lang/Float;");
+ if(configuration_field == NULL) return;
+
+ jobject boxed_value = env->GetObjectField(configuration,configuration_field);
+ if(boxed_value != NULL) {
+ jclass float_box_class = env->FindClass("java/lang/Float");
+ if(float_box_class == NULL) return;
+
+ jmethodID float_extractor = env->GetMethodID(float_box_class,"floatValue", "()F");
+ if(float_extractor == NULL) return;
+
+ jfloat value = env->CallFloatMethod(boxed_value,float_extractor);
+ if(env->ExceptionCheck()) return;
+
+ if(value < 0)
+ {
+ throw_config_value_exception(env,field_name,"cannot be set to a negative value");
+ return;
+ }
+
+ (bwa->*setter)(value);
+
+ env->DeleteLocalRef(float_box_class);
+ }
+
+ env->DeleteLocalRef(boxed_value);
+ env->DeleteLocalRef(configuration_class);
+}
+
+static void throw_config_value_exception(JNIEnv* env, const char* field_name, const char* message)
+{
+ char* buffer = new char[strlen(field_name)+1+strlen(message)+1];
+ sprintf(buffer,"%s %s",field_name,message);
+ jclass sting_exception_class = env->FindClass("org/broadinstitute/sting/utils/StingException");
+ if(sting_exception_class == NULL) return;
+ env->ThrowNew(sting_exception_class, buffer);
+ delete[] buffer;
+}
diff --git a/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h b/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h
new file mode 100644
index 000000000..0c44e430a
--- /dev/null
+++ b/public/c/bwa/org_broadinstitute_sting_alignment_bwa_c_BWACAligner.h
@@ -0,0 +1,61 @@
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include
+/* Header for class org_broadinstitute_sting_alignment_bwa_c_BWACAligner */
+
+#ifndef _Included_org_broadinstitute_sting_alignment_bwa_c_BWACAligner
+#define _Included_org_broadinstitute_sting_alignment_bwa_c_BWACAligner
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
+ * Method: create
+ * Signature: (Lorg/broadinstitute/sting/alignment/bwa/BWTFiles;Lorg/broadinstitute/sting/alignment/bwa/BWAConfiguration;)J
+ */
+JNIEXPORT jlong JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_create
+ (JNIEnv *, jobject, jobject, jobject);
+
+/*
+ * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
+ * Method: updateConfiguration
+ * Signature: (JLorg/broadinstitute/sting/alignment/bwa/BWAConfiguration;)V
+ */
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_updateConfiguration
+ (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
+ * Method: destroy
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_destroy
+ (JNIEnv *, jobject, jlong);
+
+/*
+ * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
+ * Method: getPaths
+ * Signature: (J[B)[Lorg/broadinstitute/sting/alignment/bwa/c/BWAPath;
+ */
+JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getPaths
+ (JNIEnv *, jobject, jlong, jbyteArray);
+
+/*
+ * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
+ * Method: convertPathsToAlignments
+ * Signature: (J[B[Lorg/broadinstitute/sting/alignment/bwa/c/BWAPath;)[Lorg/broadinstitute/sting/alignment/Alignment;
+ */
+JNIEXPORT jobjectArray JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_convertPathsToAlignments
+ (JNIEnv *, jobject, jlong, jbyteArray, jobjectArray);
+
+/*
+ * Class: org_broadinstitute_sting_alignment_bwa_c_BWACAligner
+ * Method: getBestAlignment
+ * Signature: (J[B)Lorg/broadinstitute/sting/alignment/Alignment;
+ */
+JNIEXPORT jobject JNICALL Java_org_broadinstitute_sting_alignment_bwa_c_BWACAligner_getBestAlignment
+ (JNIEnv *, jobject, jlong, jbyteArray);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/public/c/libenvironhack/Makefile b/public/c/libenvironhack/Makefile
new file mode 100644
index 000000000..302ff8e31
--- /dev/null
+++ b/public/c/libenvironhack/Makefile
@@ -0,0 +1,10 @@
+CC=gcc
+CCFLAGS=-Wall -dynamiclib -arch i386 -arch x86_64
+
+libenvironhack.dylib: libenvironhack.c
+ $(CC) $(CCFLAGS) -init _init_environ $< -o $@
+
+all: libenvironhack.dylib
+
+clean:
+ rm -f libenvironhack.dylib
diff --git a/public/c/libenvironhack/libenvironhack.c b/public/c/libenvironhack/libenvironhack.c
new file mode 100644
index 000000000..8b2a2640e
--- /dev/null
+++ b/public/c/libenvironhack/libenvironhack.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2010, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+LSF 7.0.6 on the mac is missing the unsatisfied exported symbol for environ which was removed on MacOS X 10.5+.
+nm $LSF_LIBDIR/liblsf.dylib | grep environ
+See "man environ" for more info, along with http://lists.apple.com/archives/java-dev/2007/Dec/msg00096.html
+*/
+
+#include
+
+char **environ = (char **)0;
+
+void init_environ(void) {
+ environ = (*_NSGetEnviron());
+}
diff --git a/public/c/libenvironhack/libenvironhack.dylib b/public/c/libenvironhack/libenvironhack.dylib
new file mode 100755
index 000000000..a45e038b4
Binary files /dev/null and b/public/c/libenvironhack/libenvironhack.dylib differ
diff --git a/public/java/src/org/broadinstitute/sting/alignment/Aligner.java b/public/java/src/org/broadinstitute/sting/alignment/Aligner.java
new file mode 100644
index 000000000..4bf05cb75
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/Aligner.java
@@ -0,0 +1,49 @@
+package org.broadinstitute.sting.alignment;
+
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMRecord;
+
+/**
+ * Create perfect alignments from the read to the genome represented by the given BWT / suffix array.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public interface Aligner {
+ /**
+ * Close this instance of the BWA pointer and delete its resources.
+ */
+ public void close();
+
+ /**
+ * Allow the aligner to choose one alignment randomly from the pile of best alignments.
+ * @param bases Bases to align.
+ * @return An align
+ */
+ public Alignment getBestAlignment(final byte[] bases);
+
+ /**
+ * Align the read to the reference.
+ * @param read Read to align.
+ * @param header Optional header to drop in place.
+ * @return A list of the alignments.
+ */
+ public SAMRecord align(final SAMRecord read, final SAMFileHeader header);
+
+ /**
+ * Get a iterator of alignments, batched by mapping quality.
+ * @param bases List of bases.
+ * @return Iterator to alignments.
+ */
+ public Iterable getAllAlignments(final byte[] bases);
+
+ /**
+ * Get a iterator of aligned reads, batched by mapping quality.
+ * @param read Read to align.
+ * @param newHeader Optional new header to use when aligning the read. If present, it must be null.
+ * @return Iterator to alignments.
+ */
+ public Iterable alignAll(final SAMRecord read, final SAMFileHeader newHeader);
+}
+
+
diff --git a/public/java/src/org/broadinstitute/sting/alignment/Alignment.java b/public/java/src/org/broadinstitute/sting/alignment/Alignment.java
new file mode 100644
index 000000000..c63f5615f
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/Alignment.java
@@ -0,0 +1,221 @@
+package org.broadinstitute.sting.alignment;
+
+import net.sf.samtools.*;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+/**
+ * Represents an alignment of a read to a site in the reference genome.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class Alignment {
+ protected int contigIndex;
+ protected long alignmentStart;
+ protected boolean negativeStrand;
+ protected int mappingQuality;
+
+ protected char[] cigarOperators;
+ protected int[] cigarLengths;
+
+ protected int editDistance;
+ protected String mismatchingPositions;
+
+ protected int numMismatches;
+ protected int numGapOpens;
+ protected int numGapExtensions;
+ protected int bestCount;
+ protected int secondBestCount;
+
+ /**
+ * Gets the index of the given contig.
+ * @return the inde
+ */
+ public int getContigIndex() { return contigIndex; }
+
+ /**
+ * Gets the starting position for the given alignment.
+ * @return Starting position.
+ */
+ public long getAlignmentStart() { return alignmentStart; }
+
+ /**
+ * Is the given alignment on the reverse strand?
+ * @return True if the alignment is on the reverse strand.
+ */
+ public boolean isNegativeStrand() { return negativeStrand; }
+
+ /**
+ * Gets the score of this alignment.
+ * @return The score.
+ */
+ public int getMappingQuality() { return mappingQuality; }
+
+ /**
+ * Gets the edit distance; will eventually end up in the NM SAM tag
+ * if this alignment makes it that far.
+ * @return The edit distance.
+ */
+ public int getEditDistance() { return editDistance; }
+
+ /**
+ * A string representation of which positions mismatch; contents of MD tag.
+ * @return String representation of mismatching positions.
+ */
+ public String getMismatchingPositions() { return mismatchingPositions; }
+
+ /**
+ * Gets the number of mismatches in the read.
+ * @return Number of mismatches.
+ */
+ public int getNumMismatches() { return numMismatches; }
+
+ /**
+ * Get the number of gap opens.
+ * @return Number of gap opens.
+ */
+ public int getNumGapOpens() { return numGapOpens; }
+
+ /**
+ * Get the number of gap extensions.
+ * @return Number of gap extensions.
+ */
+ public int getNumGapExtensions() { return numGapExtensions; }
+
+ /**
+ * Get the number of best alignments.
+ * @return Number of top scoring alignments.
+ */
+ public int getBestCount() { return bestCount; }
+
+ /**
+ * Get the number of second best alignments.
+ * @return Number of second best scoring alignments.
+ */
+ public int getSecondBestCount() { return secondBestCount; }
+
+ /**
+ * Gets the cigar for this alignment.
+ * @return sam-jdk formatted alignment.
+ */
+ public Cigar getCigar() {
+ Cigar cigar = new Cigar();
+ for(int i = 0; i < cigarOperators.length; i++) {
+ CigarOperator operator = CigarOperator.characterToEnum(cigarOperators[i]);
+ cigar.add(new CigarElement(cigarLengths[i],operator));
+ }
+ return cigar;
+ }
+
+ /**
+ * Temporarily implement getCigarString() for debugging; the TextCigarCodec is unfortunately
+ * package-protected.
+ * @return
+ */
+ public String getCigarString() {
+ Cigar cigar = getCigar();
+ if(cigar.isEmpty()) return "*";
+
+ StringBuilder cigarString = new StringBuilder();
+ for(CigarElement element: cigar.getCigarElements()) {
+ cigarString.append(element.getLength());
+ cigarString.append(element.getOperator());
+ }
+ return cigarString.toString();
+ }
+
+ /**
+ * Stub for inheritance.
+ */
+ public Alignment() {}
+
+ /**
+ * Create a new alignment object.
+ * @param contigIndex The contig to which this read aligned.
+ * @param alignmentStart The point within the contig to which this read aligned.
+ * @param negativeStrand Forward or reverse alignment of the given read.
+ * @param mappingQuality How good does BWA think this mapping is?
+ * @param cigarOperators The ordered operators in the cigar string.
+ * @param cigarLengths The lengths to which each operator applies.
+ * @param editDistance The edit distance (cumulative) of the read.
+ * @param mismatchingPositions String representation of which bases in the read mismatch.
+ * @param numMismatches Number of total mismatches in the read.
+ * @param numGapOpens Number of gap opens in the read.
+ * @param numGapExtensions Number of gap extensions in the read.
+ * @param bestCount Number of best alignments in the read.
+ * @param secondBestCount Number of second best alignments in the read.
+ */
+ public Alignment(int contigIndex,
+ int alignmentStart,
+ boolean negativeStrand,
+ int mappingQuality,
+ char[] cigarOperators,
+ int[] cigarLengths,
+ int editDistance,
+ String mismatchingPositions,
+ int numMismatches,
+ int numGapOpens,
+ int numGapExtensions,
+ int bestCount,
+ int secondBestCount) {
+ this.contigIndex = contigIndex;
+ this.alignmentStart = alignmentStart;
+ this.negativeStrand = negativeStrand;
+ this.mappingQuality = mappingQuality;
+ this.cigarOperators = cigarOperators;
+ this.cigarLengths = cigarLengths;
+ this.editDistance = editDistance;
+ this.mismatchingPositions = mismatchingPositions;
+ this.numMismatches = numMismatches;
+ this.numGapOpens = numGapOpens;
+ this.numGapExtensions = numGapExtensions;
+ this.bestCount = bestCount;
+ this.secondBestCount = secondBestCount;
+ }
+
+ /**
+ * Creates a read directly from an alignment.
+ * @param alignment The alignment to convert to a read.
+ * @param unmappedRead Source of the unmapped read. Should have bases, quality scores, and flags.
+ * @param newSAMHeader The new SAM header to use in creating this read. Can be null, but if so, the sequence
+ * dictionary in the
+ * @return A mapped alignment.
+ */
+ public static SAMRecord convertToRead(Alignment alignment, SAMRecord unmappedRead, SAMFileHeader newSAMHeader) {
+ SAMRecord read;
+ try {
+ read = (SAMRecord)unmappedRead.clone();
+ }
+ catch(CloneNotSupportedException ex) {
+ throw new ReviewedStingException("Unable to create aligned read from template.");
+ }
+
+ if(newSAMHeader != null)
+ read.setHeader(newSAMHeader);
+
+ // If we're realigning a previously aligned record, strip out the placement of the alignment.
+ read.setReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
+ read.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
+ read.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
+ read.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
+
+ if(alignment != null) {
+ read.setReadUnmappedFlag(false);
+ read.setReferenceIndex(alignment.getContigIndex());
+ read.setAlignmentStart((int)alignment.getAlignmentStart());
+ read.setReadNegativeStrandFlag(alignment.isNegativeStrand());
+ read.setMappingQuality(alignment.getMappingQuality());
+ read.setCigar(alignment.getCigar());
+ if(alignment.isNegativeStrand()) {
+ read.setReadBases(BaseUtils.simpleReverseComplement(read.getReadBases()));
+ read.setBaseQualities(Utils.reverse(read.getBaseQualities()));
+ }
+ read.setAttribute("NM",alignment.getEditDistance());
+ read.setAttribute("MD",alignment.getMismatchingPositions());
+ }
+
+ return read;
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java b/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java
new file mode 100644
index 000000000..c6755e878
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.alignment;
+
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
+import org.broadinstitute.sting.alignment.bwa.BWTFiles;
+import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.ReadWalker;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.util.Iterator;
+
+/**
+ * Validates consistency of the aligner interface by taking reads already aligned by BWA in a BAM file, stripping them
+ * of their alignment data, realigning them, and making sure one of the best resulting realignments matches the original
+ * alignment from the input file.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class AlignmentValidationWalker extends ReadWalker {
+ /**
+ * The supporting BWT index generated using BWT.
+ */
+ @Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false)
+ private String prefix = null;
+
+ /**
+ * The instance used to generate alignments.
+ */
+ private BWACAligner aligner = null;
+
+ /**
+ * Create an aligner object. The aligner object will load and hold the BWT until close() is called.
+ */
+ @Override
+ public void initialize() {
+ if(prefix == null)
+ prefix = getToolkit().getArguments().referenceFile.getAbsolutePath();
+ BWTFiles bwtFiles = new BWTFiles(prefix);
+ BWAConfiguration configuration = new BWAConfiguration();
+ aligner = new BWACAligner(bwtFiles,configuration);
+ }
+
+ /**
+ * Aligns a read to the given reference.
+ * @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
+ * @param read Read to align.
+ * @return Number of reads aligned by this map (aka 1).
+ */
+ @Override
+ public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
+ //logger.info(String.format("examining read %s", read.getReadName()));
+
+ byte[] bases = read.getReadBases();
+ if(read.getReadNegativeStrandFlag()) bases = BaseUtils.simpleReverseComplement(bases);
+
+ boolean matches = true;
+ Iterable alignments = aligner.getAllAlignments(bases);
+ Iterator alignmentIterator = alignments.iterator();
+
+ if(!alignmentIterator.hasNext()) {
+ matches = read.getReadUnmappedFlag();
+ }
+ else {
+ Alignment[] alignmentsOfBestQuality = alignmentIterator.next();
+ for(Alignment alignment: alignmentsOfBestQuality) {
+ matches = (alignment.getContigIndex() == read.getReferenceIndex());
+ matches &= (alignment.getAlignmentStart() == read.getAlignmentStart());
+ matches &= (alignment.isNegativeStrand() == read.getReadNegativeStrandFlag());
+ matches &= (alignment.getCigar().equals(read.getCigar()));
+ matches &= (alignment.getMappingQuality() == read.getMappingQuality());
+ if(matches) break;
+ }
+ }
+
+ if(!matches) {
+ logger.error("Found mismatch!");
+ logger.error(String.format("Read %s:",read.getReadName()));
+ logger.error(String.format(" Contig index: %d",read.getReferenceIndex()));
+ logger.error(String.format(" Alignment start: %d", read.getAlignmentStart()));
+ logger.error(String.format(" Negative strand: %b", read.getReadNegativeStrandFlag()));
+ logger.error(String.format(" Cigar: %s%n", read.getCigarString()));
+ logger.error(String.format(" Mapping quality: %s%n", read.getMappingQuality()));
+ for(Alignment[] alignmentsByScore: alignments) {
+ for(int i = 0; i < alignmentsByScore.length; i++) {
+ logger.error(String.format("Alignment %d:",i));
+ logger.error(String.format(" Contig index: %d",alignmentsByScore[i].getContigIndex()));
+ logger.error(String.format(" Alignment start: %d", alignmentsByScore[i].getAlignmentStart()));
+ logger.error(String.format(" Negative strand: %b", alignmentsByScore[i].isNegativeStrand()));
+ logger.error(String.format(" Cigar: %s", alignmentsByScore[i].getCigarString()));
+ logger.error(String.format(" Mapping quality: %s%n", alignmentsByScore[i].getMappingQuality()));
+ }
+ }
+ throw new ReviewedStingException(String.format("Read %s mismatches!", read.getReadName()));
+ }
+
+ return 1;
+ }
+
+ /**
+ * Initial value for reduce. In this case, validated reads will be counted.
+ * @return 0, indicating no reads yet validated.
+ */
+ @Override
+ public Integer reduceInit() { return 0; }
+
+ /**
+ * Calculates the number of reads processed.
+ * @param value Number of reads processed by this map.
+ * @param sum Number of reads processed before this map.
+ * @return Number of reads processed up to and including this map.
+ */
+ @Override
+ public Integer reduce(Integer value, Integer sum) {
+ return value + sum;
+ }
+
+ /**
+ * Cleanup.
+ * @param result Number of reads processed.
+ */
+ @Override
+ public void onTraversalDone(Integer result) {
+ aligner.close();
+ super.onTraversalDone(result);
+ }
+
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java b/public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java
new file mode 100644
index 000000000..7064e637f
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.alignment;
+
+import net.sf.picard.reference.ReferenceSequenceFileFactory;
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMRecord;
+import net.sf.samtools.SAMSequenceDictionary;
+import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
+import org.broadinstitute.sting.alignment.bwa.BWTFiles;
+import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
+import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.ReadWalker;
+import org.broadinstitute.sting.gatk.walkers.WalkerName;
+
+import java.io.File;
+
+/**
+ * Aligns reads to a given reference using Heng Li's BWA aligner, presenting the resulting alignments in SAM or BAM format.
+ * Mimics the steps 'bwa aln' followed by 'bwa samse' using the BWA/C implementation.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+@WalkerName("Align")
+public class AlignmentWalker extends ReadWalker {
+ @Argument(fullName="target_reference",shortName="target_ref",doc="The reference to which reads in the source file should be aligned. Alongside this reference should sit index files " +
+ "generated by bwa index -d bwtsw. If unspecified, will default " +
+ "to the reference specified via the -R argument.",required=false)
+ private File targetReferenceFile = null;
+
+ @Output
+ private StingSAMFileWriter out = null;
+
+ /**
+ * The actual aligner.
+ */
+ private BWACAligner aligner = null;
+
+ /**
+ * New header to use, if desired.
+ */
+ private SAMFileHeader header;
+
+ /**
+ * Create an aligner object. The aligner object will load and hold the BWT until close() is called.
+ */
+ @Override
+ public void initialize() {
+ if(targetReferenceFile == null)
+ targetReferenceFile = getToolkit().getArguments().referenceFile;
+ BWTFiles bwtFiles = new BWTFiles(targetReferenceFile.getAbsolutePath());
+ BWAConfiguration configuration = new BWAConfiguration();
+ aligner = new BWACAligner(bwtFiles,configuration);
+
+ // Take the header of the SAM file, tweak it by adding in the reference dictionary and specifying that the target file is unsorted.
+ header = getToolkit().getSAMFileHeader().clone();
+ SAMSequenceDictionary referenceDictionary =
+ ReferenceSequenceFileFactory.getReferenceSequenceFile(targetReferenceFile).getSequenceDictionary();
+ header.setSequenceDictionary(referenceDictionary);
+ header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
+
+ out.writeHeader(header);
+ }
+
+ /**
+ * Aligns a read to the given reference.
+ * @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
+ * @param read Read to align.
+ * @return Number of alignments found for this read.
+ */
+ @Override
+ public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
+ SAMRecord alignedRead = aligner.align(read,header);
+ out.addAlignment(alignedRead);
+ return 1;
+ }
+
+ /**
+ * Initial value for reduce. In this case, alignments will be counted.
+ * @return 0, indicating no alignments yet found.
+ */
+ @Override
+ public Integer reduceInit() { return 0; }
+
+ /**
+ * Calculates the number of alignments found.
+ * @param value Number of alignments found by this map.
+ * @param sum Number of alignments found before this map.
+ * @return Number of alignments found up to and including this map.
+ */
+ @Override
+ public Integer reduce(Integer value, Integer sum) {
+ return value + sum;
+ }
+
+ /**
+ * Cleanup.
+ * @param result Number of reads processed.
+ */
+ @Override
+ public void onTraversalDone(Integer result) {
+ aligner.close();
+ super.onTraversalDone(result);
+ }
+
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/CountBestAlignmentsWalker.java b/public/java/src/org/broadinstitute/sting/alignment/CountBestAlignmentsWalker.java
new file mode 100644
index 000000000..57d92319f
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/CountBestAlignmentsWalker.java
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.alignment;
+
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
+import org.broadinstitute.sting.alignment.bwa.BWTFiles;
+import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.ReadWalker;
+
+import java.io.PrintStream;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+/**
+ * Counts the number of best alignments as presented by BWA and outputs a histogram of number of placements vs. the
+ * frequency of that number of placements.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class CountBestAlignmentsWalker extends ReadWalker {
+ /**
+ * The supporting BWT index generated using BWT.
+ */
+ @Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false)
+ private String prefix = null;
+
+ @Output
+ private PrintStream out = null;
+
+ /**
+ * The actual aligner.
+ */
+ private Aligner aligner = null;
+
+ private SortedMap alignmentFrequencies = new TreeMap();
+
+ /**
+ * Create an aligner object. The aligner object will load and hold the BWT until close() is called.
+ */
+ @Override
+ public void initialize() {
+ if(prefix == null)
+ prefix = getToolkit().getArguments().referenceFile.getAbsolutePath();
+ BWTFiles bwtFiles = new BWTFiles(prefix);
+ BWAConfiguration configuration = new BWAConfiguration();
+ aligner = new BWACAligner(bwtFiles,configuration);
+ }
+
+ /**
+ * Aligns a read to the given reference.
+ * @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
+ * @param read Read to align.
+ * @return Number of alignments found for this read.
+ */
+ @Override
+ public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
+ Iterator alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator();
+ if(alignmentIterator.hasNext()) {
+ int numAlignments = alignmentIterator.next().length;
+ if(alignmentFrequencies.containsKey(numAlignments))
+ alignmentFrequencies.put(numAlignments,alignmentFrequencies.get(numAlignments)+1);
+ else
+ alignmentFrequencies.put(numAlignments,1);
+ }
+ return 1;
+ }
+
+ /**
+ * Initial value for reduce. In this case, validated reads will be counted.
+ * @return 0, indicating no reads yet validated.
+ */
+ @Override
+ public Integer reduceInit() { return 0; }
+
+ /**
+ * Calculates the number of reads processed.
+ * @param value Number of reads processed by this map.
+ * @param sum Number of reads processed before this map.
+ * @return Number of reads processed up to and including this map.
+ */
+ @Override
+ public Integer reduce(Integer value, Integer sum) {
+ return value + sum;
+ }
+
+ /**
+ * Cleanup.
+ * @param result Number of reads processed.
+ */
+ @Override
+ public void onTraversalDone(Integer result) {
+ aligner.close();
+ for(Map.Entry alignmentFrequency: alignmentFrequencies.entrySet())
+ out.printf("%d\t%d%n", alignmentFrequency.getKey(), alignmentFrequency.getValue());
+ super.onTraversalDone(result);
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java
new file mode 100644
index 000000000..ddbf784f5
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java
@@ -0,0 +1,38 @@
+package org.broadinstitute.sting.alignment.bwa;
+
+import org.broadinstitute.sting.alignment.Aligner;
+
+/**
+ * Align reads using BWA.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public abstract class BWAAligner implements Aligner {
+ /**
+ * The supporting files used by BWA.
+ */
+ protected BWTFiles bwtFiles;
+
+ /**
+ * The current configuration for the BWA aligner.
+ */
+ protected BWAConfiguration configuration;
+
+ /**
+ * Create a new BWAAligner. Purpose of this call is to ensure that all BWA constructors accept the correct
+ * parameters.
+ * @param bwtFiles The many files representing BWTs persisted to disk.
+ * @param configuration Configuration parameters for the alignment.
+ */
+ public BWAAligner(BWTFiles bwtFiles, BWAConfiguration configuration) {
+ this.bwtFiles = bwtFiles;
+ this.configuration = configuration;
+ }
+
+ /**
+ * Update the configuration passed to the BWA aligner.
+ * @param configuration New configuration to set.
+ */
+ public abstract void updateConfiguration(BWAConfiguration configuration);
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java
new file mode 100644
index 000000000..73441cb6a
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/BWAConfiguration.java
@@ -0,0 +1,44 @@
+package org.broadinstitute.sting.alignment.bwa;
+
+/**
+ * Configuration for the BWA/C aligner.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class BWAConfiguration {
+ /**
+ * The maximum edit distance used by BWA.
+ */
+ public Float maximumEditDistance = null;
+
+ /**
+ * How many gap opens are acceptable within this alignment?
+ */
+ public Integer maximumGapOpens = null;
+
+ /**
+ * How many gap extensions are acceptable within this alignment?
+ */
+ public Integer maximumGapExtensions = null;
+
+ /**
+ * Do we disallow indels within a certain range from the start / end?
+ */
+ public Integer disallowIndelWithinRange = null;
+
+ /**
+ * What is the scoring penalty for a mismatch?
+ */
+ public Integer mismatchPenalty = null;
+
+ /**
+ * What is the scoring penalty for a gap open?
+ */
+ public Integer gapOpenPenalty = null;
+
+ /**
+ * What is the scoring penalty for a gap extension?
+ */
+ public Integer gapExtensionPenalty = null;
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java
new file mode 100644
index 000000000..a0589ac84
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/BWTFiles.java
@@ -0,0 +1,234 @@
+package org.broadinstitute.sting.alignment.bwa;
+
+import net.sf.samtools.SAMSequenceDictionary;
+import net.sf.samtools.SAMSequenceRecord;
+import net.sf.samtools.util.StringUtil;
+import org.broadinstitute.sting.alignment.reference.bwt.*;
+import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * Support files for BWT.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class BWTFiles {
+ /**
+ * ANN (?) file name.
+ */
+ public final File annFile;
+
+ /**
+ * AMB (?) file name.
+ */
+ public final File ambFile;
+
+ /**
+ * Packed reference sequence file.
+ */
+ public final File pacFile;
+
+ /**
+ * Reverse of packed reference sequence file.
+ */
+ public final File rpacFile;
+
+ /**
+ * Forward BWT file.
+ */
+ public final File forwardBWTFile;
+
+ /**
+ * Forward suffix array file.
+ */
+ public final File forwardSAFile;
+
+ /**
+ * Reverse BWT file.
+ */
+ public final File reverseBWTFile;
+
+ /**
+ * Reverse suffix array file.
+ */
+ public final File reverseSAFile;
+
+ /**
+ * Where these files autogenerated on the fly?
+ */
+ public final boolean autogenerated;
+
+ /**
+ * Create a new BWA configuration file using the given prefix.
+ * @param prefix Prefix to use when creating the configuration. Must not be null.
+ */
+ public BWTFiles(String prefix) {
+ if(prefix == null)
+ throw new ReviewedStingException("Prefix must not be null.");
+ annFile = new File(prefix + ".ann");
+ ambFile = new File(prefix + ".amb");
+ pacFile = new File(prefix + ".pac");
+ rpacFile = new File(prefix + ".rpac");
+ forwardBWTFile = new File(prefix + ".bwt");
+ forwardSAFile = new File(prefix + ".sa");
+ reverseBWTFile = new File(prefix + ".rbwt");
+ reverseSAFile = new File(prefix + ".rsa");
+ autogenerated = false;
+ }
+
+ /**
+ * Hand-create a new BWTFiles object, specifying a unique file object for each type.
+ * @param annFile ANN (alternate dictionary) file.
+ * @param ambFile AMB (holes) files.
+ * @param pacFile Packed representation of the forward reference sequence.
+ * @param forwardBWTFile BWT representation of the forward reference sequence.
+ * @param forwardSAFile SA representation of the forward reference sequence.
+ * @param rpacFile Packed representation of the reversed reference sequence.
+ * @param reverseBWTFile BWT representation of the reversed reference sequence.
+ * @param reverseSAFile SA representation of the reversed reference sequence.
+ */
+ private BWTFiles(File annFile,
+ File ambFile,
+ File pacFile,
+ File forwardBWTFile,
+ File forwardSAFile,
+ File rpacFile,
+ File reverseBWTFile,
+ File reverseSAFile) {
+ this.annFile = annFile;
+ this.ambFile = ambFile;
+ this.pacFile = pacFile;
+ this.forwardBWTFile = forwardBWTFile;
+ this.forwardSAFile = forwardSAFile;
+ this.rpacFile = rpacFile;
+ this.reverseBWTFile = reverseBWTFile;
+ this.reverseSAFile = reverseSAFile;
+ autogenerated = true;
+ }
+
+ /**
+ * Close out this files object, in the process deleting any temporary filse
+ * that were created.
+ */
+ public void close() {
+ if(autogenerated) {
+ boolean success = true;
+ success = annFile.delete();
+ success &= ambFile.delete();
+ success &= pacFile.delete();
+ success &= forwardBWTFile.delete();
+ success &= forwardSAFile.delete();
+ success &= rpacFile.delete();
+ success &= reverseBWTFile.delete();
+ success &= reverseSAFile.delete();
+
+ if(!success)
+ throw new ReviewedStingException("Unable to clean up autogenerated representation");
+ }
+ }
+
+ /**
+ * Create a new set of BWT files from the given reference sequence.
+ * @param referenceSequence Sequence from which to build metadata.
+ * @return A new object representing encoded representations of each sequence.
+ */
+ public static BWTFiles createFromReferenceSequence(byte[] referenceSequence) {
+ byte[] normalizedReferenceSequence = new byte[referenceSequence.length];
+ System.arraycopy(referenceSequence,0,normalizedReferenceSequence,0,referenceSequence.length);
+ normalizeReferenceSequence(normalizedReferenceSequence);
+
+ File annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile;
+ try {
+ // Write the ann and amb for this reference sequence.
+ annFile = File.createTempFile("bwt",".ann");
+ ambFile = File.createTempFile("bwt",".amb");
+
+ SAMSequenceDictionary dictionary = new SAMSequenceDictionary();
+ dictionary.addSequence(new SAMSequenceRecord("autogenerated",normalizedReferenceSequence.length));
+
+ ANNWriter annWriter = new ANNWriter(annFile);
+ annWriter.write(dictionary);
+ annWriter.close();
+
+ AMBWriter ambWriter = new AMBWriter(ambFile);
+ ambWriter.writeEmpty(dictionary);
+ ambWriter.close();
+
+ // Write the encoded files for the forward version of this reference sequence.
+ pacFile = File.createTempFile("bwt",".pac");
+ bwtFile = File.createTempFile("bwt",".bwt");
+ saFile = File.createTempFile("bwt",".sa");
+
+ writeEncodedReferenceSequence(normalizedReferenceSequence,pacFile,bwtFile,saFile);
+
+ // Write the encoded files for the reverse version of this reference sequence.
+ byte[] reverseReferenceSequence = Utils.reverse(normalizedReferenceSequence);
+
+ rpacFile = File.createTempFile("bwt",".rpac");
+ rbwtFile = File.createTempFile("bwt",".rbwt");
+ rsaFile = File.createTempFile("bwt",".rsa");
+
+ writeEncodedReferenceSequence(reverseReferenceSequence,rpacFile,rbwtFile,rsaFile);
+ }
+ catch(IOException ex) {
+ throw new ReviewedStingException("Unable to write autogenerated reference sequence to temporary files");
+ }
+
+ // Make sure that, at the very least, all temporary files are deleted on exit.
+ annFile.deleteOnExit();
+ ambFile.deleteOnExit();
+ pacFile.deleteOnExit();
+ bwtFile.deleteOnExit();
+ saFile.deleteOnExit();
+ rpacFile.deleteOnExit();
+ rbwtFile.deleteOnExit();
+ rsaFile.deleteOnExit();
+
+ return new BWTFiles(annFile,ambFile,pacFile,bwtFile,saFile,rpacFile,rbwtFile,rsaFile);
+ }
+
+ /**
+ * Write the encoded form of the reference sequence. In the case of BWA, the encoded reference
+ * sequence is the reference itself in PAC format, the BWT, and the suffix array.
+ * @param referenceSequence The reference sequence to encode.
+ * @param pacFile Target for the PAC-encoded reference.
+ * @param bwtFile Target for the BWT representation of the reference.
+ * @param suffixArrayFile Target for the suffix array encoding of the reference.
+ * @throws java.io.IOException In case of issues writing to the file.
+ */
+ private static void writeEncodedReferenceSequence(byte[] referenceSequence,
+ File pacFile,
+ File bwtFile,
+ File suffixArrayFile) throws IOException {
+ PackUtils.writeReferenceSequence(pacFile,referenceSequence);
+
+ BWT bwt = BWT.createFromReferenceSequence(referenceSequence);
+ BWTWriter bwtWriter = new BWTWriter(bwtFile);
+ bwtWriter.write(bwt);
+ bwtWriter.close();
+
+ SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence);
+ SuffixArrayWriter suffixArrayWriter = new SuffixArrayWriter(suffixArrayFile);
+ suffixArrayWriter.write(suffixArray);
+ suffixArrayWriter.close();
+ }
+
+ /**
+ * Convert the given reference sequence into a form suitable for building into
+ * on-the-fly sequences.
+ * @param referenceSequence The reference sequence to normalize.
+ * @throws org.broadinstitute.sting.utils.exceptions.ReviewedStingException if normalized sequence cannot be generated.
+ */
+ private static void normalizeReferenceSequence(byte[] referenceSequence) {
+ StringUtil.toUpperCase(referenceSequence);
+ for(byte base: referenceSequence) {
+ if(base != 'A' && base != 'C' && base != 'G' && base != 'T')
+ throw new ReviewedStingException(String.format("Base type %c is not supported when building references on-the-fly",(char)base));
+ }
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java
new file mode 100644
index 000000000..165314259
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWACAligner.java
@@ -0,0 +1,259 @@
+package org.broadinstitute.sting.alignment.bwa.c;
+
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.alignment.Alignment;
+import org.broadinstitute.sting.alignment.bwa.BWAAligner;
+import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
+import org.broadinstitute.sting.alignment.bwa.BWTFiles;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.util.Arrays;
+import java.util.Iterator;
+
+/**
+ * An aligner using the BWA/C implementation.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class BWACAligner extends BWAAligner {
+ static {
+ System.loadLibrary("bwa");
+ }
+
+ /**
+ * A pointer to the C++ object representing the BWA engine.
+ */
+ private long thunkPointer = 0;
+
+ public BWACAligner(BWTFiles bwtFiles, BWAConfiguration configuration) {
+ super(bwtFiles,configuration);
+ if(thunkPointer != 0)
+ throw new ReviewedStingException("BWA/C attempting to reinitialize.");
+
+ if(!bwtFiles.annFile.exists()) throw new ReviewedStingException("ANN file is missing; please rerun 'bwa aln' to regenerate it.");
+ if(!bwtFiles.ambFile.exists()) throw new ReviewedStingException("AMB file is missing; please rerun 'bwa aln' to regenerate it.");
+ if(!bwtFiles.pacFile.exists()) throw new ReviewedStingException("PAC file is missing; please rerun 'bwa aln' to regenerate it.");
+ if(!bwtFiles.forwardBWTFile.exists()) throw new ReviewedStingException("Forward BWT file is missing; please rerun 'bwa aln' to regenerate it.");
+ if(!bwtFiles.forwardSAFile.exists()) throw new ReviewedStingException("Forward SA file is missing; please rerun 'bwa aln' to regenerate it.");
+ if(!bwtFiles.reverseBWTFile.exists()) throw new ReviewedStingException("Reverse BWT file is missing; please rerun 'bwa aln' to regenerate it.");
+ if(!bwtFiles.reverseSAFile.exists()) throw new ReviewedStingException("Reverse SA file is missing; please rerun 'bwa aln' to regenerate it.");
+
+ thunkPointer = create(bwtFiles,configuration);
+ }
+
+ /**
+ * Create an aligner object using an array of bytes as a reference.
+ * @param referenceSequence Reference sequence to encode ad-hoc.
+ * @param configuration Configuration for the given aligner.
+ */
+ public BWACAligner(byte[] referenceSequence, BWAConfiguration configuration) {
+ this(BWTFiles.createFromReferenceSequence(referenceSequence),configuration);
+ // Now that the temporary files are created, the temporary files can be destroyed.
+ bwtFiles.close();
+ }
+
+ /**
+ * Update the configuration passed to the BWA aligner.
+ * @param configuration New configuration to set.
+ */
+ @Override
+ public void updateConfiguration(BWAConfiguration configuration) {
+ if(thunkPointer == 0)
+ throw new ReviewedStingException("BWA/C: attempting to update configuration of uninitialized aligner.");
+ updateConfiguration(thunkPointer,configuration);
+ }
+
+ /**
+ * Close this instance of the BWA pointer and delete its resources.
+ */
+ @Override
+ public void close() {
+ if(thunkPointer == 0)
+ throw new ReviewedStingException("BWA/C close attempted, but BWA/C is not properly initialized.");
+ destroy(thunkPointer);
+ }
+
+ /**
+ * Allow the aligner to choose one alignment randomly from the pile of best alignments.
+ * @param bases Bases to align.
+ * @return An align
+ */
+ @Override
+ public Alignment getBestAlignment(final byte[] bases) {
+ if(thunkPointer == 0)
+ throw new ReviewedStingException("BWA/C getBestAlignment attempted, but BWA/C is not properly initialized.");
+ return getBestAlignment(thunkPointer,bases);
+ }
+
+ /**
+ * Get the best aligned read, chosen randomly from the pile of best alignments.
+ * @param read Read to align.
+ * @param newHeader New header to apply to this SAM file. Can be null, but if so, read header must be valid.
+ * @return Read with injected alignment data.
+ */
+ @Override
+ public SAMRecord align(final SAMRecord read, final SAMFileHeader newHeader) {
+ if(bwtFiles.autogenerated)
+ throw new UnsupportedOperationException("Cannot create target alignment; source contig was generated ad-hoc and is not reliable");
+ return Alignment.convertToRead(getBestAlignment(read.getReadBases()),read,newHeader);
+ }
+
+ /**
+ * Get a iterator of alignments, batched by mapping quality.
+ * @param bases List of bases.
+ * @return Iterator to alignments.
+ */
+ @Override
+ public Iterable getAllAlignments(final byte[] bases) {
+ final BWAPath[] paths = getPaths(bases);
+ return new Iterable() {
+ public Iterator iterator() {
+ return new Iterator() {
+ /**
+ * The last position accessed.
+ */
+ private int position = 0;
+
+ /**
+ * Whether all alignments have been seen based on the current position.
+ * @return True if any more alignments are pending. False otherwise.
+ */
+ public boolean hasNext() { return position < paths.length; }
+
+ /**
+ * Return the next cross-section of alignments, based on mapping quality.
+ * @return Array of the next set of alignments of a given mapping quality.
+ */
+ public Alignment[] next() {
+ if(position >= paths.length)
+ throw new UnsupportedOperationException("Out of alignments to return.");
+ int score = paths[position].score;
+ int startingPosition = position;
+ while(position < paths.length && paths[position].score == score) position++;
+ return convertPathsToAlignments(bases,Arrays.copyOfRange(paths,startingPosition,position));
+ }
+
+ /**
+ * Unsupported.
+ */
+ public void remove() { throw new UnsupportedOperationException("Cannot remove from an alignment iterator"); }
+ };
+ }
+ };
+ }
+
+ /**
+ * Get a iterator of aligned reads, batched by mapping quality.
+ * @param read Read to align.
+ * @param newHeader Optional new header to use when aligning the read. If present, it must be null.
+ * @return Iterator to alignments.
+ */
+ @Override
+ public Iterable alignAll(final SAMRecord read, final SAMFileHeader newHeader) {
+ if(bwtFiles.autogenerated)
+ throw new UnsupportedOperationException("Cannot create target alignment; source contig was generated ad-hoc and is not reliable");
+ final Iterable alignments = getAllAlignments(read.getReadBases());
+ return new Iterable() {
+ public Iterator iterator() {
+ final Iterator alignmentIterator = alignments.iterator();
+ return new Iterator() {
+ /**
+ * Whether all alignments have been seen based on the current position.
+ * @return True if any more alignments are pending. False otherwise.
+ */
+ public boolean hasNext() { return alignmentIterator.hasNext(); }
+
+ /**
+ * Return the next cross-section of alignments, based on mapping quality.
+ * @return Array of the next set of alignments of a given mapping quality.
+ */
+ public SAMRecord[] next() {
+ Alignment[] alignmentsOfQuality = alignmentIterator.next();
+ SAMRecord[] reads = new SAMRecord[alignmentsOfQuality.length];
+ for(int i = 0; i < alignmentsOfQuality.length; i++) {
+ reads[i] = Alignment.convertToRead(alignmentsOfQuality[i],read,newHeader);
+ }
+ return reads;
+ }
+
+ /**
+ * Unsupported.
+ */
+ public void remove() { throw new UnsupportedOperationException("Cannot remove from an alignment iterator"); }
+ };
+ }
+ };
+ }
+
+ /**
+ * Get the paths associated with the given base string.
+ * @param bases List of bases.
+ * @return A set of paths through the BWA.
+ */
+ public BWAPath[] getPaths(byte[] bases) {
+ if(thunkPointer == 0)
+ throw new ReviewedStingException("BWA/C getPaths attempted, but BWA/C is not properly initialized.");
+ return getPaths(thunkPointer,bases);
+ }
+
+ /**
+ * Create a pointer to the BWA/C thunk.
+ * @param files BWT source files.
+ * @param configuration Configuration of the aligner.
+ * @return Pointer to the BWA/C thunk.
+ */
+ protected native long create(BWTFiles files, BWAConfiguration configuration);
+
+ /**
+ * Update the configuration passed to the BWA aligner. For internal use only.
+ * @param thunkPointer pointer to BWA object.
+ * @param configuration New configuration to set.
+ */
+ protected native void updateConfiguration(long thunkPointer, BWAConfiguration configuration);
+
+ /**
+ * Destroy the BWA/C thunk.
+ * @param thunkPointer Pointer to the allocated thunk.
+ */
+ protected native void destroy(long thunkPointer);
+
+ /**
+ * Do the extra steps involved in converting a local alignment to a global alignment.
+ * @param bases ASCII representation of byte array.
+ * @param paths Paths through the current BWT.
+ * @return A list of alignments.
+ */
+ protected Alignment[] convertPathsToAlignments(byte[] bases, BWAPath[] paths) {
+ if(thunkPointer == 0)
+ throw new ReviewedStingException("BWA/C convertPathsToAlignments attempted, but BWA/C is not properly initialized.");
+ return convertPathsToAlignments(thunkPointer,bases,paths);
+ }
+
+ /**
+ * Caller to the path generation functionality within BWA/C. Call this method's getPaths() wrapper (above) instead.
+ * @param thunkPointer pointer to the C++ object managing BWA/C.
+ * @param bases ASCII representation of byte array.
+ * @return A list of paths through the specified BWT.
+ */
+ protected native BWAPath[] getPaths(long thunkPointer, byte[] bases);
+
+ /**
+ * Do the extra steps involved in converting a local alignment to a global alignment.
+ * Call this method's convertPathsToAlignments() wrapper (above) instead.
+ * @param thunkPointer pointer to the C++ object managing BWA/C.
+ * @param bases ASCII representation of byte array.
+ * @param paths Paths through the current BWT.
+ * @return A list of alignments.
+ */
+ protected native Alignment[] convertPathsToAlignments(long thunkPointer, byte[] bases, BWAPath[] paths);
+
+ /**
+ * Gets the best alignment from BWA/C, randomly selected from all best-aligned reads.
+ * @param thunkPointer Pointer to BWA thunk.
+ * @param bases bases to align.
+ * @return The best alignment from BWA/C.
+ */
+ protected native Alignment getBestAlignment(long thunkPointer, byte[] bases);
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java
new file mode 100755
index 000000000..347d4344f
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/c/BWAPath.java
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2009 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.alignment.bwa.c;
+
+/**
+ * Models a BWA path.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class BWAPath {
+ /**
+ * Number of mismatches encountered along this path.
+ */
+ public final int numMismatches;
+
+ /**
+ * Number of gap opens encountered along this path.
+ */
+ public final int numGapOpens;
+
+ /**
+ * Number of gap extensions along this path.
+ */
+ public final int numGapExtensions;
+
+ /**
+ * Whether this alignment was found on the positive or negative strand.
+ */
+ public final boolean negativeStrand;
+
+ /**
+ * Starting coordinate in the BWT.
+ */
+ public final long k;
+
+ /**
+ * Ending coordinate in the BWT.
+ */
+ public final long l;
+
+ /**
+ * The score of this path.
+ */
+ public final int score;
+
+ /**
+ * The number of best alignments seen along this path.
+ */
+ public final int bestCount;
+
+ /**
+ * The number of second best alignments seen along this path.
+ */
+ public final int secondBestCount;
+
+ /**
+ * Create a new path with the given attributes.
+ * @param numMismatches Number of mismatches along path.
+ * @param numGapOpens Number of gap opens along path.
+ * @param numGapExtensions Number of gap extensions along path.
+ * @param k Index to first coordinate within BWT.
+ * @param l Index to last coordinate within BWT.
+ * @param score Score of this alignment. Not the mapping quality.
+ */
+ public BWAPath(int numMismatches, int numGapOpens, int numGapExtensions, boolean negativeStrand, long k, long l, int score, int bestCount, int secondBestCount) {
+ this.numMismatches = numMismatches;
+ this.numGapOpens = numGapOpens;
+ this.numGapExtensions = numGapExtensions;
+ this.negativeStrand = negativeStrand;
+ this.k = k;
+ this.l = l;
+ this.score = score;
+ this.bestCount = bestCount;
+ this.secondBestCount = secondBestCount;
+ }
+
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java
new file mode 100644
index 000000000..2d568a96a
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignerTestHarness.java
@@ -0,0 +1,164 @@
+package org.broadinstitute.sting.alignment.bwa.java;
+
+import net.sf.picard.reference.IndexedFastaSequenceFile;
+import net.sf.samtools.*;
+import org.broadinstitute.sting.alignment.Aligner;
+import org.broadinstitute.sting.alignment.Alignment;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+
+/**
+ * A test harness to ensure that the perfect aligner works.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class AlignerTestHarness {
+ public static void main( String argv[] ) throws FileNotFoundException {
+ if( argv.length != 6 ) {
+ System.out.println("PerfectAlignerTestHarness ");
+ System.exit(1);
+ }
+
+ File referenceFile = new File(argv[0]);
+ File bwtFile = new File(argv[1]);
+ File rbwtFile = new File(argv[2]);
+ File suffixArrayFile = new File(argv[3]);
+ File reverseSuffixArrayFile = new File(argv[4]);
+ File bamFile = new File(argv[5]);
+
+ align(referenceFile,bwtFile,rbwtFile,suffixArrayFile,reverseSuffixArrayFile,bamFile);
+ }
+
+ private static void align(File referenceFile, File bwtFile, File rbwtFile, File suffixArrayFile, File reverseSuffixArrayFile, File bamFile) throws FileNotFoundException {
+ Aligner aligner = new BWAJavaAligner(bwtFile,rbwtFile,suffixArrayFile,reverseSuffixArrayFile);
+ int count = 0;
+
+ SAMFileReader reader = new SAMFileReader(bamFile);
+ reader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);
+
+ int mismatches = 0;
+ int failures = 0;
+
+ for(SAMRecord read: reader) {
+ count++;
+ if( count > 200000 ) break;
+ //if( count < 366000 ) continue;
+ //if( count > 2 ) break;
+ //if( !read.getReadName().endsWith("SL-XBC:1:82:506:404#0") )
+ // continue;
+ //if( !read.getReadName().endsWith("SL-XBC:1:36:30:1926#0") )
+ // continue;
+ //if( !read.getReadName().endsWith("SL-XBC:1:60:1342:1340#0") )
+ // continue;
+
+ SAMRecord alignmentCleaned = null;
+ try {
+ alignmentCleaned = (SAMRecord)read.clone();
+ }
+ catch( CloneNotSupportedException ex ) {
+ throw new ReviewedStingException("SAMRecord clone not supported", ex);
+ }
+
+ if( alignmentCleaned.getReadNegativeStrandFlag() )
+ alignmentCleaned.setReadBases(BaseUtils.simpleReverseComplement(alignmentCleaned.getReadBases()));
+
+ alignmentCleaned.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX);
+ alignmentCleaned.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
+ alignmentCleaned.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY);
+ alignmentCleaned.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR);
+
+ // Clear everything except flags pertaining to pairing and set 'unmapped' status to true.
+ alignmentCleaned.setFlags(alignmentCleaned.getFlags() & 0x00A1 | 0x000C);
+
+ Iterable alignments = aligner.getAllAlignments(alignmentCleaned.getReadBases());
+ if(!alignments.iterator().hasNext() ) {
+ //throw new StingException(String.format("Unable to align read %s to reference; count = %d",read.getReadName(),count));
+ System.out.printf("Unable to align read %s to reference; count = %d%n",read.getReadName(),count);
+ failures++;
+ }
+
+ Alignment foundAlignment = null;
+ for(Alignment[] alignmentsOfQuality: alignments) {
+ for(Alignment alignment: alignmentsOfQuality) {
+ if( read.getReadNegativeStrandFlag() != alignment.isNegativeStrand() )
+ continue;
+ if( read.getAlignmentStart() != alignment.getAlignmentStart() )
+ continue;
+
+ foundAlignment = alignment;
+ }
+ }
+
+ if( foundAlignment != null ) {
+ //System.out.printf("%s: Aligned read to reference at position %d with %d mismatches, %d gap opens, and %d gap extensions.%n", read.getReadName(), foundAlignment.getAlignmentStart(), foundAlignment.getMismatches(), foundAlignment.getGapOpens(), foundAlignment.getGapExtensions());
+ }
+ else {
+ System.out.printf("Error aligning read %s%n", read.getReadName());
+
+ mismatches++;
+
+ IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile);
+
+ System.out.printf("read = %s, position = %d, negative strand = %b%n", formatBasesBasedOnCigar(read.getReadString(),read.getCigar(),CigarOperator.DELETION),
+ read.getAlignmentStart(),
+ read.getReadNegativeStrandFlag());
+ int numDeletions = numDeletionsInCigar(read.getCigar());
+ String expectedRef = new String(reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),read.getAlignmentStart(),read.getAlignmentStart()+read.getReadLength()+numDeletions-1).getBases());
+ System.out.printf("expected ref = %s%n", formatBasesBasedOnCigar(expectedRef,read.getCigar(),CigarOperator.INSERTION));
+
+ for(Alignment[] alignmentsOfQuality: alignments) {
+ for(Alignment alignment: alignmentsOfQuality) {
+ System.out.println();
+
+ Cigar cigar = ((BWAAlignment)alignment).getCigar();
+
+ System.out.printf("read = %s%n", formatBasesBasedOnCigar(read.getReadString(),cigar,CigarOperator.DELETION));
+
+ int deletionCount = ((BWAAlignment)alignment).getNumberOfBasesMatchingState(AlignmentState.DELETION);
+ String alignedRef = new String(reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),alignment.getAlignmentStart(),alignment.getAlignmentStart()+read.getReadLength()+deletionCount-1).getBases());
+ System.out.printf("actual ref = %s, position = %d, negative strand = %b%n", formatBasesBasedOnCigar(alignedRef,cigar,CigarOperator.INSERTION),
+ alignment.getAlignmentStart(),
+ alignment.isNegativeStrand());
+ }
+ }
+
+ //throw new StingException(String.format("Read %s was placed at incorrect location; count = %d%n",read.getReadName(),count));
+ }
+
+
+ if( count % 1000 == 0 )
+ System.out.printf("%d reads examined.%n",count);
+ }
+
+ System.out.printf("%d reads examined; %d mismatches; %d failures.%n",count,mismatches,failures);
+ }
+
+ private static String formatBasesBasedOnCigar( String bases, Cigar cigar, CigarOperator toBlank ) {
+ StringBuilder formatted = new StringBuilder();
+ int readIndex = 0;
+ for(CigarElement cigarElement: cigar.getCigarElements()) {
+ if(cigarElement.getOperator() == toBlank) {
+ int number = cigarElement.getLength();
+ while( number-- > 0 ) formatted.append(' ');
+ }
+ else {
+ int number = cigarElement.getLength();
+ while( number-- > 0 ) formatted.append(bases.charAt(readIndex++));
+ }
+ }
+ return formatted.toString();
+ }
+
+ private static int numDeletionsInCigar( Cigar cigar ) {
+ int numDeletions = 0;
+ for(CigarElement cigarElement: cigar.getCigarElements()) {
+ if(cigarElement.getOperator() == CigarOperator.DELETION)
+ numDeletions += cigarElement.getLength();
+ }
+ return numDeletions;
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java
new file mode 100644
index 000000000..f1e3c31b6
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentMatchSequence.java
@@ -0,0 +1,150 @@
+package org.broadinstitute.sting.alignment.bwa.java;
+
+import net.sf.samtools.Cigar;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.CigarOperator;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.Iterator;
+
+/**
+ * Represents a sequence of matches.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class AlignmentMatchSequence implements Cloneable {
+ /**
+ * Stores the particular match entries in the order they occur.
+ */
+ private Deque entries = new ArrayDeque();
+
+ /**
+ * Clone the given match sequence.
+ * @return A deep copy of the current match sequence.
+ */
+ public AlignmentMatchSequence clone() {
+ AlignmentMatchSequence copy = null;
+ try {
+ copy = (AlignmentMatchSequence)super.clone();
+ }
+ catch( CloneNotSupportedException ex ) {
+ throw new ReviewedStingException("Unable to clone AlignmentMatchSequence.");
+ }
+
+ copy.entries = new ArrayDeque();
+ for( AlignmentMatchSequenceEntry entry: entries )
+ copy.entries.add(entry.clone());
+
+ return copy;
+ }
+
+ public Cigar convertToCigar(boolean negativeStrand) {
+ Cigar cigar = new Cigar();
+ Iterator iterator = negativeStrand ? entries.descendingIterator() : entries.iterator();
+ while( iterator.hasNext() ) {
+ AlignmentMatchSequenceEntry entry = iterator.next();
+ CigarOperator operator;
+ switch( entry.getAlignmentState() ) {
+ case MATCH_MISMATCH: operator = CigarOperator.MATCH_OR_MISMATCH; break;
+ case INSERTION: operator = CigarOperator.INSERTION; break;
+ case DELETION: operator = CigarOperator.DELETION; break;
+ default: throw new ReviewedStingException("convertToCigar: cannot process state: " + entry.getAlignmentState());
+ }
+ cigar.add( new CigarElement(entry.count,operator) );
+ }
+ return cigar;
+ }
+
+ /**
+ * All a new alignment of the given state.
+ * @param state State to add to the sequence.
+ */
+ public void addNext( AlignmentState state ) {
+ AlignmentMatchSequenceEntry last = entries.peekLast();
+ // If the last entry is the same as this one, increment it. Otherwise, add a new entry.
+ if( last != null && last.alignmentState == state )
+ last.increment();
+ else
+ entries.add(new AlignmentMatchSequenceEntry(state));
+ }
+
+ /**
+ * Gets the current state of this alignment (what's the state of the last base?)
+ * @return State of the most recently aligned base.
+ */
+ public AlignmentState getCurrentState() {
+ if( entries.size() == 0 )
+ return AlignmentState.MATCH_MISMATCH;
+ return entries.peekLast().getAlignmentState();
+ }
+
+ /**
+ * How many bases in the read match the given state.
+ * @param state State to test.
+ * @return number of bases which match that state.
+ */
+ public int getNumberOfBasesMatchingState(AlignmentState state) {
+ int matches = 0;
+ for( AlignmentMatchSequenceEntry entry: entries ) {
+ if( entry.getAlignmentState() == state )
+ matches += entry.count;
+ }
+ return matches;
+ }
+
+ /**
+ * Stores an individual match sequence entry.
+ */
+ private class AlignmentMatchSequenceEntry implements Cloneable {
+ /**
+ * The state of the alignment throughout a given point in the sequence.
+ */
+ private final AlignmentState alignmentState;
+
+ /**
+ * The number of bases having this particular state.
+ */
+ private int count;
+
+ /**
+ * Create a new sequence entry with the given state.
+ * @param alignmentState The state that this sequence should contain.
+ */
+ AlignmentMatchSequenceEntry( AlignmentState alignmentState ) {
+ this.alignmentState = alignmentState;
+ this.count = 1;
+ }
+
+ /**
+ * Clone the given match sequence entry.
+ * @return A deep copy of the current match sequence entry.
+ */
+ public AlignmentMatchSequenceEntry clone() {
+ try {
+ return (AlignmentMatchSequenceEntry)super.clone();
+ }
+ catch( CloneNotSupportedException ex ) {
+ throw new ReviewedStingException("Unable to clone AlignmentMatchSequenceEntry.");
+ }
+ }
+
+ /**
+ * Retrieves the current state of the alignment.
+ * @return The state of the current sequence.
+ */
+ AlignmentState getAlignmentState() {
+ return alignmentState;
+ }
+
+ /**
+ * Increment the count of alignments having this particular state.
+ */
+ void increment() {
+ count++;
+ }
+ }
+}
+
diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java
new file mode 100644
index 000000000..92c603335
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/AlignmentState.java
@@ -0,0 +1,13 @@
+package org.broadinstitute.sting.alignment.bwa.java;
+
+/**
+ * The state of a given base in the alignment.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public enum AlignmentState {
+ MATCH_MISMATCH,
+ INSERTION,
+ DELETION
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java
new file mode 100644
index 000000000..f3b515dba
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAAlignment.java
@@ -0,0 +1,190 @@
+package org.broadinstitute.sting.alignment.bwa.java;
+
+import net.sf.samtools.Cigar;
+import org.broadinstitute.sting.alignment.Alignment;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+/**
+ * An alignment object to be used incrementally as the BWA aligner
+ * inspects the read.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class BWAAlignment extends Alignment implements Cloneable {
+ /**
+ * Track the number of alignments that have been created.
+ */
+ private static long numCreated;
+
+ /**
+ * Which number alignment is this?
+ */
+ private long creationNumber;
+
+ /**
+ * The aligner performing the alignments.
+ */
+ protected BWAJavaAligner aligner;
+
+ /**
+ * The sequence of matches/mismatches/insertions/deletions.
+ */
+ private AlignmentMatchSequence alignmentMatchSequence = new AlignmentMatchSequence();
+
+ /**
+ * Working variable. How many bases have been matched at this point.
+ */
+ protected int position;
+
+ /**
+ * Working variable. How many mismatches have been encountered at this point.
+ */
+ private int mismatches;
+
+ /**
+ * Number of gap opens in alignment.
+ */
+ private int gapOpens;
+
+ /**
+ * Number of gap extensions in alignment.
+ */
+ private int gapExtensions;
+
+ /**
+ * Working variable. The lower bound of the alignment within the BWT.
+ */
+ protected long loBound;
+
+ /**
+ * Working variable. The upper bound of the alignment within the BWT.
+ */
+ protected long hiBound;
+
+ protected void setAlignmentStart(long position) {
+ this.alignmentStart = position;
+ }
+
+ protected void setNegativeStrand(boolean negativeStrand) {
+ this.negativeStrand = negativeStrand;
+ }
+
+ /**
+ * Cache the score.
+ */
+ private int score;
+
+ public Cigar getCigar() {
+ return alignmentMatchSequence.convertToCigar(isNegativeStrand());
+ }
+
+ /**
+ * Gets the current state of this alignment (state of the last base viewed)..
+ * @return Current state of the alignment.
+ */
+ public AlignmentState getCurrentState() {
+ return alignmentMatchSequence.getCurrentState();
+ }
+
+ /**
+ * Adds the given state to the current alignment.
+ * @param state State to add to the given alignment.
+ */
+ public void addState( AlignmentState state ) {
+ alignmentMatchSequence.addNext(state);
+ }
+
+ /**
+ * Gets the BWA score of this alignment.
+ * @return BWA-style scores. 0 is best.
+ */
+ public int getScore() {
+ return score;
+ }
+
+ public int getMismatches() { return mismatches; }
+ public int getGapOpens() { return gapOpens; }
+ public int getGapExtensions() { return gapExtensions; }
+
+ public void incrementMismatches() {
+ this.mismatches++;
+ updateScore();
+ }
+
+ public void incrementGapOpens() {
+ this.gapOpens++;
+ updateScore();
+ }
+
+ public void incrementGapExtensions() {
+ this.gapExtensions++;
+ updateScore();
+ }
+
+ /**
+ * Updates the score based on new information about matches / mismatches.
+ */
+ private void updateScore() {
+ score = mismatches*aligner.MISMATCH_PENALTY + gapOpens*aligner.GAP_OPEN_PENALTY + gapExtensions*aligner.GAP_EXTENSION_PENALTY;
+ }
+
+ /**
+ * Create a new alignment with the given parent aligner.
+ * @param aligner Aligner being used.
+ */
+ public BWAAlignment( BWAJavaAligner aligner ) {
+ this.aligner = aligner;
+ this.creationNumber = numCreated++;
+ }
+
+ /**
+ * Clone the alignment.
+ * @return New instance of the alignment.
+ */
+ public BWAAlignment clone() {
+ BWAAlignment newAlignment = null;
+ try {
+ newAlignment = (BWAAlignment)super.clone();
+ }
+ catch( CloneNotSupportedException ex ) {
+ throw new ReviewedStingException("Unable to clone BWAAlignment.");
+ }
+ newAlignment.creationNumber = numCreated++;
+ newAlignment.alignmentMatchSequence = alignmentMatchSequence.clone();
+
+ return newAlignment;
+ }
+
+ /**
+ * How many bases in the read match the given state.
+ * @param state State to test.
+ * @return number of bases which match that state.
+ */
+ public int getNumberOfBasesMatchingState(AlignmentState state) {
+ return alignmentMatchSequence.getNumberOfBasesMatchingState(state);
+ }
+
+ /**
+ * Compare this alignment to another alignment.
+ * @param rhs Other alignment to which to compare.
+ * @return < 0 if this < other, == 0 if this == other, > 0 if this > other
+ */
+ public int compareTo(Alignment rhs) {
+ BWAAlignment other = (BWAAlignment)rhs;
+
+ // If the scores are different, disambiguate using the score.
+ if(score != other.score)
+ return score > other.score ? 1 : -1;
+
+ // Otherwise, use the order in which the elements were created.
+ if(creationNumber != other.creationNumber)
+ return creationNumber > other.creationNumber ? -1 : 1;
+
+ return 0;
+ }
+
+ public String toString() {
+ return String.format("position: %d, strand: %b, state: %s, mismatches: %d, gap opens: %d, gap extensions: %d, loBound: %d, hiBound: %d, score: %d, creationNumber: %d", position, negativeStrand, alignmentMatchSequence.getCurrentState(), mismatches, gapOpens, gapExtensions, loBound, hiBound, getScore(), creationNumber);
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java
new file mode 100644
index 000000000..fbeac9192
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/BWAJavaAligner.java
@@ -0,0 +1,393 @@
+package org.broadinstitute.sting.alignment.bwa.java;
+
+import net.sf.samtools.SAMFileHeader;
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.alignment.Alignment;
+import org.broadinstitute.sting.alignment.bwa.BWAAligner;
+import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
+import org.broadinstitute.sting.alignment.reference.bwt.*;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.Utils;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.PriorityQueue;
+
+/**
+ * Create imperfect alignments from the read to the genome represented by the given BWT / suffix array.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class BWAJavaAligner extends BWAAligner {
+ /**
+ * BWT in the forward direction.
+ */
+ private BWT forwardBWT;
+
+ /**
+ * BWT in the reverse direction.
+ */
+ private BWT reverseBWT;
+
+ /**
+ * Suffix array in the forward direction.
+ */
+ private SuffixArray forwardSuffixArray;
+
+ /**
+ * Suffix array in the reverse direction.
+ */
+ private SuffixArray reverseSuffixArray;
+
+ /**
+ * Maximum edit distance (-n option from original BWA).
+ */
+ private final int MAXIMUM_EDIT_DISTANCE = 4;
+
+ /**
+ * Maximum number of gap opens (-o option from original BWA).
+ */
+ private final int MAXIMUM_GAP_OPENS = 1;
+
+ /**
+ * Maximum number of gap extensions (-e option from original BWA).
+ */
+ private final int MAXIMUM_GAP_EXTENSIONS = 6;
+
+ /**
+ * Penalty for straight mismatches (-M option from original BWA).
+ */
+ public final int MISMATCH_PENALTY = 3;
+
+ /**
+ * Penalty for gap opens (-O option from original BWA).
+ */
+ public final int GAP_OPEN_PENALTY = 11;
+
+ /**
+ * Penalty for gap extensions (-E option from original BWA).
+ */
+ public final int GAP_EXTENSION_PENALTY = 4;
+
+ /**
+ * Skip the ends of indels.
+ */
+ public final int INDEL_END_SKIP = 5;
+
+ public BWAJavaAligner( File forwardBWTFile, File reverseBWTFile, File forwardSuffixArrayFile, File reverseSuffixArrayFile ) {
+ super(null,null);
+ forwardBWT = new BWTReader(forwardBWTFile).read();
+ reverseBWT = new BWTReader(reverseBWTFile).read();
+ forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile,forwardBWT).read();
+ reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile,reverseBWT).read();
+ }
+
+ /**
+ * Close this instance of the BWA pointer and delete its resources.
+ */
+ @Override
+ public void close() {
+ throw new UnsupportedOperationException("BWA aligner can't currently be closed.");
+ }
+
+ /**
+ * Update the current parameters of this aligner.
+ * @param configuration New configuration to set.
+ */
+ public void updateConfiguration(BWAConfiguration configuration) {
+ throw new UnsupportedOperationException("Configuration of the BWA aligner can't currently be changed.");
+ }
+
+ /**
+ * Allow the aligner to choose one alignment randomly from the pile of best alignments.
+ * @param bases Bases to align.
+ * @return An align
+ */
+ public Alignment getBestAlignment(final byte[] bases) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
+
+ /**
+ * Align the read to the reference.
+ * @param read Read to align.
+ * @param header Optional header to drop in place.
+ * @return A list of the alignments.
+ */
+ public SAMRecord align(final SAMRecord read, final SAMFileHeader header) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
+
+ /**
+ * Get a iterator of alignments, batched by mapping quality.
+ * @param bases List of bases.
+ * @return Iterator to alignments.
+ */
+ public Iterable getAllAlignments(final byte[] bases) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
+
+ /**
+ * Get a iterator of aligned reads, batched by mapping quality.
+ * @param read Read to align.
+ * @param newHeader Optional new header to use when aligning the read. If present, it must be null.
+ * @return Iterator to alignments.
+ */
+ public Iterable alignAll(final SAMRecord read, final SAMFileHeader newHeader) { throw new UnsupportedOperationException("BWAJavaAligner does not yet support the standard Aligner interface."); }
+
+
+ public List align( SAMRecord read ) {
+ List successfulMatches = new ArrayList();
+
+ Byte[] uncomplementedBases = normalizeBases(read.getReadBases());
+ Byte[] complementedBases = normalizeBases(Utils.reverse(BaseUtils.simpleReverseComplement(read.getReadBases())));
+
+ List forwardLowerBounds = LowerBound.create(uncomplementedBases,forwardBWT);
+ List reverseLowerBounds = LowerBound.create(complementedBases,reverseBWT);
+
+ // Seed the best score with any score that won't overflow on comparison.
+ int bestScore = Integer.MAX_VALUE - MISMATCH_PENALTY;
+ int bestDiff = MAXIMUM_EDIT_DISTANCE+1;
+ int maxDiff = MAXIMUM_EDIT_DISTANCE;
+
+ PriorityQueue alignments = new PriorityQueue();
+
+ // Create a fictional initial alignment, with the position just off the end of the read, and the limits
+ // set as the entire BWT.
+ alignments.add(createSeedAlignment(reverseBWT));
+ alignments.add(createSeedAlignment(forwardBWT));
+
+ while(!alignments.isEmpty()) {
+ BWAAlignment alignment = alignments.remove();
+
+ // From bwtgap.c in the original BWT; if the rank is worse than the best score + the mismatch PENALTY, move on.
+ if( alignment.getScore() > bestScore + MISMATCH_PENALTY )
+ break;
+
+ Byte[] bases = alignment.isNegativeStrand() ? complementedBases : uncomplementedBases;
+ BWT bwt = alignment.isNegativeStrand() ? forwardBWT : reverseBWT;
+ List lowerBounds = alignment.isNegativeStrand() ? reverseLowerBounds : forwardLowerBounds;
+
+ // if z < D(i) then return {}
+ int mismatches = maxDiff - alignment.getMismatches() - alignment.getGapOpens() - alignment.getGapExtensions();
+ if( alignment.position < lowerBounds.size()-1 && mismatches < lowerBounds.get(alignment.position+1).value )
+ continue;
+
+ if(mismatches == 0) {
+ exactMatch(alignment,bases,bwt);
+ if(alignment.loBound > alignment.hiBound)
+ continue;
+ }
+
+ // Found a valid alignment; store it and move on.
+ if(alignment.position >= read.getReadLength()-1) {
+ for(long bwtIndex = alignment.loBound; bwtIndex <= alignment.hiBound; bwtIndex++) {
+ BWAAlignment finalAlignment = alignment.clone();
+
+ if( finalAlignment.isNegativeStrand() )
+ finalAlignment.setAlignmentStart(forwardSuffixArray.get(bwtIndex) + 1);
+ else {
+ int sizeAlongReference = read.getReadLength() -
+ finalAlignment.getNumberOfBasesMatchingState(AlignmentState.INSERTION) +
+ finalAlignment.getNumberOfBasesMatchingState(AlignmentState.DELETION);
+ finalAlignment.setAlignmentStart(reverseBWT.length() - reverseSuffixArray.get(bwtIndex) - sizeAlongReference + 1);
+ }
+
+ successfulMatches.add(finalAlignment);
+
+ bestScore = Math.min(finalAlignment.getScore(),bestScore);
+ bestDiff = Math.min(finalAlignment.getMismatches()+finalAlignment.getGapOpens()+finalAlignment.getGapExtensions(),bestDiff);
+ maxDiff = bestDiff + 1;
+ }
+
+ continue;
+ }
+
+ //System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d, base = %s%n", alignments.size(), alignment, lowerBounds.get(alignment.position+1).value, alignment.position >= 0 ? (char)bases[alignment.position].byteValue() : "");
+ /*
+ System.out.printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%d,%d]\t[%d,%d]%n",alignments.size(),
+ alignment.negativeStrand?1:0,
+ bases.length-alignment.position-1,
+ alignment.getCurrentState().toString().charAt(0),
+ alignment.getMismatches(),
+ alignment.getGapOpens(),
+ alignment.getGapExtensions(),
+ lowerBounds.get(alignment.position+1).value,
+ lowerBounds.get(alignment.position+1).width,
+ alignment.loBound,
+ alignment.hiBound);
+ */
+
+ // Temporary -- look ahead to see if the next alignment is bounded.
+ boolean allowDifferences = mismatches > 0;
+ boolean allowMismatches = mismatches > 0;
+
+ if( allowDifferences &&
+ alignment.position+1 >= INDEL_END_SKIP-1+alignment.getGapOpens()+alignment.getGapExtensions() &&
+ read.getReadLength()-1-(alignment.position+1) >= INDEL_END_SKIP+alignment.getGapOpens()+alignment.getGapExtensions() ) {
+ if( alignment.getCurrentState() == AlignmentState.MATCH_MISMATCH ) {
+ if( alignment.getGapOpens() < MAXIMUM_GAP_OPENS ) {
+ // Add a potential insertion extension.
+ BWAAlignment insertionAlignment = createInsertionAlignment(alignment);
+ insertionAlignment.incrementGapOpens();
+ alignments.add(insertionAlignment);
+
+ // Add a potential deletion by marking a deletion and augmenting the position.
+ List deletionAlignments = createDeletionAlignments(bwt,alignment);
+ for( BWAAlignment deletionAlignment: deletionAlignments )
+ deletionAlignment.incrementGapOpens();
+ alignments.addAll(deletionAlignments);
+ }
+ }
+ else if( alignment.getCurrentState() == AlignmentState.INSERTION ) {
+ if( alignment.getGapExtensions() < MAXIMUM_GAP_EXTENSIONS && mismatches > 0 ) {
+ // Add a potential insertion extension.
+ BWAAlignment insertionAlignment = createInsertionAlignment(alignment);
+ insertionAlignment.incrementGapExtensions();
+ alignments.add(insertionAlignment);
+ }
+ }
+ else if( alignment.getCurrentState() == AlignmentState.DELETION ) {
+ if( alignment.getGapExtensions() < MAXIMUM_GAP_EXTENSIONS && mismatches > 0 ) {
+ // Add a potential deletion by marking a deletion and augmenting the position.
+ List deletionAlignments = createDeletionAlignments(bwt,alignment);
+ for( BWAAlignment deletionAlignment: deletionAlignments )
+ deletionAlignment.incrementGapExtensions();
+ alignments.addAll(deletionAlignments);
+ }
+ }
+ }
+
+ // Mismatches
+ alignments.addAll(createMatchedAlignments(bwt,alignment,bases,allowDifferences&&allowMismatches));
+ }
+
+ return successfulMatches;
+ }
+
+ /**
+ * Create an seeding alignment to use as a starting point when traversing.
+ * @param bwt source BWT.
+ * @return Seed alignment.
+ */
+ private BWAAlignment createSeedAlignment(BWT bwt) {
+ BWAAlignment seed = new BWAAlignment(this);
+ seed.setNegativeStrand(bwt == forwardBWT);
+ seed.position = -1;
+ seed.loBound = 0;
+ seed.hiBound = bwt.length();
+ return seed;
+ }
+
+ /**
+ * Creates a new alignments representing direct matches / mismatches.
+ * @param bwt Source BWT with which to work.
+ * @param alignment Alignment for the previous position.
+ * @param bases The bases in the read.
+ * @param allowMismatch Should mismatching bases be allowed?
+ * @return New alignment representing this position if valid; null otherwise.
+ */
+ private List createMatchedAlignments( BWT bwt, BWAAlignment alignment, Byte[] bases, boolean allowMismatch ) {
+ List newAlignments = new ArrayList();
+
+ List baseChoices = new ArrayList();
+ Byte thisBase = bases[alignment.position+1];
+
+ if( allowMismatch )
+ baseChoices.addAll(Bases.allOf());
+ else
+ baseChoices.add(thisBase);
+
+ if( thisBase != null ) {
+ // Keep rotating the current base to the last position until we've hit the current base.
+ for( ;; ) {
+ baseChoices.add(baseChoices.remove(0));
+ if( thisBase.equals(baseChoices.get(baseChoices.size()-1)) )
+ break;
+
+ }
+ }
+
+ for(byte base: baseChoices) {
+ BWAAlignment newAlignment = alignment.clone();
+
+ newAlignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1;
+ newAlignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound);
+
+ // If this alignment is valid, skip it.
+ if( newAlignment.loBound > newAlignment.hiBound )
+ continue;
+
+ newAlignment.position++;
+ newAlignment.addState(AlignmentState.MATCH_MISMATCH);
+ if( bases[newAlignment.position] == null || base != bases[newAlignment.position] )
+ newAlignment.incrementMismatches();
+
+ newAlignments.add(newAlignment);
+ }
+
+ return newAlignments;
+ }
+
+ /**
+ * Create a new alignment representing an insertion at this point in the read.
+ * @param alignment Alignment from which to derive the insertion.
+ * @return New alignment reflecting the insertion.
+ */
+ private BWAAlignment createInsertionAlignment( BWAAlignment alignment ) {
+ // Add a potential insertion extension.
+ BWAAlignment newAlignment = alignment.clone();
+ newAlignment.position++;
+ newAlignment.addState(AlignmentState.INSERTION);
+ return newAlignment;
+ }
+
+ /**
+ * Create new alignments representing a deletion at this point in the read.
+ * @param bwt source BWT for inferring deletion info.
+ * @param alignment Alignment from which to derive the deletion.
+ * @return New alignments reflecting all possible deletions.
+ */
+ private List createDeletionAlignments( BWT bwt, BWAAlignment alignment) {
+ List newAlignments = new ArrayList();
+ for(byte base: Bases.instance) {
+ BWAAlignment newAlignment = alignment.clone();
+
+ newAlignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1;
+ newAlignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound);
+
+ // If this alignment is valid, skip it.
+ if( newAlignment.loBound > newAlignment.hiBound )
+ continue;
+
+ newAlignment.addState(AlignmentState.DELETION);
+
+ newAlignments.add(newAlignment);
+ }
+
+ return newAlignments;
+ }
+
+ /**
+ * Exactly match the given alignment against the given BWT.
+ * @param alignment Alignment to match.
+ * @param bases Bases to use.
+ * @param bwt BWT to use.
+ */
+ private void exactMatch( BWAAlignment alignment, Byte[] bases, BWT bwt ) {
+ while( ++alignment.position < bases.length ) {
+ byte base = bases[alignment.position];
+ alignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1;
+ alignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound);
+ if( alignment.loBound > alignment.hiBound )
+ return;
+ }
+ }
+
+ /**
+ * Make each base into A/C/G/T or null if unknown.
+ * @param bases Base string to normalize.
+ * @return Array of normalized bases.
+ */
+ private Byte[] normalizeBases( byte[] bases ) {
+ Byte[] normalBases = new Byte[bases.length];
+ for(int i = 0; i < bases.length; i++)
+ normalBases[i] = Bases.fromASCII(bases[i]);
+ return normalBases;
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java
new file mode 100644
index 000000000..be7514255
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/bwa/java/LowerBound.java
@@ -0,0 +1,88 @@
+package org.broadinstitute.sting.alignment.bwa.java;
+
+import org.broadinstitute.sting.alignment.reference.bwt.BWT;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * At any point along the given read, what is a good lower bound for the
+ * total number of differences?
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class LowerBound {
+ /**
+ * Lower bound of the suffix array.
+ */
+ public final long loIndex;
+
+ /**
+ * Upper bound of the suffix array.
+ */
+ public final long hiIndex;
+
+ /**
+ * Width of the bwt from loIndex -> hiIndex, inclusive.
+ */
+ public final long width;
+
+ /**
+ * The lower bound at the given point.
+ */
+ public final int value;
+
+ /**
+ * Create a new lower bound with the given value.
+ * @param loIndex The lower bound of the BWT.
+ * @param hiIndex The upper bound of the BWT.
+ * @param value Value for the lower bound at this site.
+ */
+ private LowerBound(long loIndex, long hiIndex, int value) {
+ this.loIndex = loIndex;
+ this.hiIndex = hiIndex;
+ this.width = hiIndex - loIndex + 1;
+ this.value = value;
+ }
+
+ /**
+ * Create a non-optimal bound according to the algorithm specified in Figure 3 of the BWA paper.
+ * @param bases Bases of the read to use when creating a new BWT.
+ * @param bwt BWT to check against.
+ * @return A list of lower bounds at every point in the reference.
+ *
+ */
+ public static List create(Byte[] bases, BWT bwt) {
+ List bounds = new ArrayList();
+
+ long loIndex = 0, hiIndex = bwt.length();
+ int mismatches = 0;
+ for( int i = bases.length-1; i >= 0; i-- ) {
+ Byte base = bases[i];
+
+ // Ignore non-ACGT bases.
+ if( base != null ) {
+ loIndex = bwt.counts(base) + bwt.occurrences(base,loIndex-1) + 1;
+ hiIndex = bwt.counts(base) + bwt.occurrences(base,hiIndex);
+ }
+
+ if( base == null || loIndex > hiIndex ) {
+ loIndex = 0;
+ hiIndex = bwt.length();
+ mismatches++;
+ }
+ bounds.add(0,new LowerBound(loIndex,hiIndex,mismatches));
+ }
+
+ return bounds;
+ }
+
+ /**
+ * Create a string representation of this bound.
+ * @return String version of this bound.
+ */
+ public String toString() {
+ return String.format("LowerBound: w = %d, value = %d",width,value);
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/package-info.java b/public/java/src/org/broadinstitute/sting/alignment/package-info.java
new file mode 100644
index 000000000..60cf1e425
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/package-info.java
@@ -0,0 +1,4 @@
+/**
+ * Analyses used to validate the correctness and performance the BWA Java bindings.
+ */
+package org.broadinstitute.sting.alignment;
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java
new file mode 100644
index 000000000..ec10415dd
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/AMBWriter.java
@@ -0,0 +1,68 @@
+package org.broadinstitute.sting.alignment.reference.bwt;
+
+import net.sf.samtools.SAMSequenceDictionary;
+import net.sf.samtools.SAMSequenceRecord;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintStream;
+
+/**
+ * Writes .amb files - a file indicating where 'holes' (indeterminant bases)
+ * exist in the contig. Currently, only empty, placeholder AMBs are supported.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class AMBWriter {
+ /**
+ * Number of holes is fixed at zero.
+ */
+ private static final int NUM_HOLES = 0;
+
+ /**
+ * Input stream from which to read BWT data.
+ */
+ private final PrintStream out;
+
+ /**
+ * Create a new ANNWriter targeting the given file.
+ * @param file file into which ANN data should be written.
+ * @throws java.io.IOException if there is a problem opening the output file.
+ */
+ public AMBWriter(File file) throws IOException {
+ out = new PrintStream(file);
+ }
+
+ /**
+ * Create a new ANNWriter targeting the given OutputStream.
+ * @param stream Stream into which ANN data should be written.
+ */
+ public AMBWriter(OutputStream stream) {
+ out = new PrintStream(stream);
+ }
+
+ /**
+ * Write the contents of the given dictionary into the AMB file.
+ * Assumes that there are no holes in the dictionary.
+ * @param dictionary Dictionary to write.
+ */
+ public void writeEmpty(SAMSequenceDictionary dictionary) {
+ long genomeLength = 0L;
+ for(SAMSequenceRecord sequence: dictionary.getSequences())
+ genomeLength += sequence.getSequenceLength();
+
+ int sequences = dictionary.getSequences().size();
+
+ // Write the header
+ out.printf("%d %d %d%n",genomeLength,sequences,NUM_HOLES);
+ }
+
+ /**
+ * Close the given output stream.
+ */
+ public void close() {
+ out.close();
+ }
+}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java
new file mode 100644
index 000000000..8d692a9e7
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/ANNWriter.java
@@ -0,0 +1,95 @@
+package org.broadinstitute.sting.alignment.reference.bwt;
+
+import net.sf.samtools.SAMSequenceDictionary;
+import net.sf.samtools.SAMSequenceRecord;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintStream;
+
+/**
+ * Writes .ann files - an alternate sequence dictionary format
+ * used by BWA/C. For best results, the input sequence dictionary
+ * should be created with Picard's CreateSequenceDictionary.jar,
+ * TRUNCATE_NAMES_AT_WHITESPACE=false.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class ANNWriter {
+ /**
+ * BWA uses a fixed seed of 11, written into every file.
+ */
+ private static final int BNS_SEED = 11;
+
+ /**
+ * A seemingly unused value that appears in every contig in the ANN.
+ */
+ private static final int GI = 0;
+
+ /**
+ * Input stream from which to read BWT data.
+ */
+ private final PrintStream out;
+
+ /**
+ * Create a new ANNWriter targeting the given file.
+ * @param file file into which ANN data should be written.
+ * @throws IOException if there is a problem opening the output file.
+ */
+ public ANNWriter(File file) throws IOException {
+ out = new PrintStream(file);
+ }
+
+ /**
+ * Create a new ANNWriter targeting the given OutputStream.
+ * @param stream Stream into which ANN data should be written.
+ */
+ public ANNWriter(OutputStream stream) {
+ out = new PrintStream(stream);
+ }
+
+ /**
+ * Write the contents of the given dictionary into the ANN file.
+ * Assumes that no ambs (blocks of indeterminate base) are present in the dictionary.
+ * @param dictionary Dictionary to write.
+ */
+ public void write(SAMSequenceDictionary dictionary) {
+ long genomeLength = 0L;
+ for(SAMSequenceRecord sequence: dictionary.getSequences())
+ genomeLength += sequence.getSequenceLength();
+
+ int sequences = dictionary.getSequences().size();
+
+ // Write the header
+ out.printf("%d %d %d%n",genomeLength,sequences,BNS_SEED);
+
+ for(SAMSequenceRecord sequence: dictionary.getSequences()) {
+ String fullSequenceName = sequence.getSequenceName();
+ String trimmedSequenceName = fullSequenceName;
+ String sequenceComment = "(null)";
+
+ long offset = 0;
+
+ // Separate the sequence name from the sequence comment, based on BWA's definition.
+ // BWA's definition appears to accept a zero-length contig name, so mimic that behavior.
+ if(fullSequenceName.indexOf(' ') >= 0) {
+ trimmedSequenceName = fullSequenceName.substring(0,fullSequenceName.indexOf(' '));
+ sequenceComment = fullSequenceName.substring(fullSequenceName.indexOf(' ')+1);
+ }
+
+ // Write the sequence GI (?), name, and comment.
+ out.printf("%d %s %s%n",GI,trimmedSequenceName,sequenceComment);
+ // Write the sequence offset, length, and ambs (currently fixed at 0).
+ out.printf("%d %d %d%n",offset,sequence.getSequenceLength(),0);
+ }
+ }
+
+ /**
+ * Close the given output stream.
+ */
+ public void close() {
+ out.close();
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java
new file mode 100644
index 000000000..7f8c48253
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWT.java
@@ -0,0 +1,172 @@
+package org.broadinstitute.sting.alignment.reference.bwt;
+
+import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+/**
+ * Represents the Burrows-Wheeler Transform of a reference sequence.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class BWT {
+ /**
+ * Write an occurrence table after every SEQUENCE_BLOCK_SIZE bases.
+ * For this implementation to behave correctly, SEQUENCE_BLOCK_SIZE % 8 == 0
+ */
+ public static final int SEQUENCE_BLOCK_SIZE = 128;
+
+ /**
+ * The inverse SA, used as a placeholder for determining where the special EOL character sits.
+ */
+ protected final long inverseSA0;
+
+ /**
+ * Cumulative counts for the entire BWT.
+ */
+ protected final Counts counts;
+
+ /**
+ * The individual sequence blocks, modelling how they appear on disk.
+ */
+ protected final SequenceBlock[] sequenceBlocks;
+
+ /**
+ * Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII).
+ * @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
+ * @param counts Cumulative count of bases, in A,C,G,T order.
+ * @param sequenceBlocks The full BWT sequence, sans the '$'.
+ */
+ public BWT( long inverseSA0, Counts counts, SequenceBlock[] sequenceBlocks ) {
+ this.inverseSA0 = inverseSA0;
+ this.counts = counts;
+ this.sequenceBlocks = sequenceBlocks;
+ }
+
+ /**
+ * Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII).
+ * @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
+ * @param counts Count of bases, in A,C,G,T order.
+ * @param sequence The full BWT sequence, sans the '$'.
+ */
+ public BWT( long inverseSA0, Counts counts, byte[] sequence ) {
+ this(inverseSA0,counts,generateSequenceBlocks(sequence));
+ }
+
+ /**
+ * Extract the full sequence from the list of block.
+ * @return The full BWT string as a byte array.
+ */
+ public byte[] getSequence() {
+ byte[] sequence = new byte[(int)counts.getTotal()];
+ for( SequenceBlock block: sequenceBlocks )
+ System.arraycopy(block.sequence,0,sequence,block.sequenceStart,block.sequenceLength);
+ return sequence;
+ }
+
+ /**
+ * Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search.
+ * @param base The base.
+ * @return Total counts for all bases lexicographically smaller than this base.
+ */
+ public long counts(byte base) {
+ return counts.getCumulative(base);
+ }
+
+ /**
+ * Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search.
+ * @param base The base.
+ * @param index The position to search within the BWT.
+ * @return Total counts for all bases lexicographically smaller than this base.
+ */
+ public long occurrences(byte base,long index) {
+ SequenceBlock block = getSequenceBlock(index);
+ int position = getSequencePosition(index);
+ long accumulator = block.occurrences.get(base);
+ for(int i = 0; i <= position; i++) {
+ if(base == block.sequence[i])
+ accumulator++;
+ }
+ return accumulator;
+ }
+
+ /**
+ * The number of bases in the BWT as a whole.
+ * @return Number of bases.
+ */
+ public long length() {
+ return counts.getTotal();
+ }
+
+ /**
+ * Create a new BWT from the given reference sequence.
+ * @param referenceSequence Sequence from which to derive the BWT.
+ * @return reference sequence-derived BWT.
+ */
+ public static BWT createFromReferenceSequence(byte[] referenceSequence) {
+ SuffixArray suffixArray = SuffixArray.createFromReferenceSequence(referenceSequence);
+
+ byte[] bwt = new byte[(int)suffixArray.length()-1];
+ int bwtIndex = 0;
+ for(long suffixArrayIndex = 0; suffixArrayIndex < suffixArray.length(); suffixArrayIndex++) {
+ if(suffixArray.get(suffixArrayIndex) == 0)
+ continue;
+ bwt[bwtIndex++] = referenceSequence[(int)suffixArray.get(suffixArrayIndex)-1];
+ }
+
+ return new BWT(suffixArray.inverseSA0,suffixArray.occurrences,bwt);
+ }
+
+ /**
+ * Gets the base at a given position in the BWT.
+ * @param index The index to use.
+ * @return The base at that location.
+ */
+ protected byte getBase(long index) {
+ if(index == inverseSA0)
+ throw new ReviewedStingException(String.format("Base at index %d does not have a text representation",index));
+
+ SequenceBlock block = getSequenceBlock(index);
+ int position = getSequencePosition(index);
+ return block.sequence[position];
+ }
+
+ private SequenceBlock getSequenceBlock(long index) {
+ // If the index is above the SA-1[0], remap it to the appropriate coordinate space.
+ if(index > inverseSA0) index--;
+ return sequenceBlocks[(int)(index/SEQUENCE_BLOCK_SIZE)];
+ }
+
+ private int getSequencePosition(long index) {
+ // If the index is above the SA-1[0], remap it to the appropriate coordinate space.
+ if(index > inverseSA0) index--;
+ return (int)(index%SEQUENCE_BLOCK_SIZE);
+ }
+
+ /**
+ * Create a set of sequence blocks from one long sequence.
+ * @param sequence Sequence from which to derive blocks.
+ * @return Array of sequence blocks containing data from the sequence.
+ */
+ private static SequenceBlock[] generateSequenceBlocks( byte[] sequence ) {
+ Counts occurrences = new Counts();
+
+ int numSequenceBlocks = PackUtils.numberOfPartitions(sequence.length,SEQUENCE_BLOCK_SIZE);
+ SequenceBlock[] sequenceBlocks = new SequenceBlock[numSequenceBlocks];
+
+ for( int block = 0; block < numSequenceBlocks; block++ ) {
+ int blockStart = block*SEQUENCE_BLOCK_SIZE;
+ int blockLength = Math.min(SEQUENCE_BLOCK_SIZE, sequence.length-blockStart);
+ byte[] subsequence = new byte[blockLength];
+
+ System.arraycopy(sequence,blockStart,subsequence,0,blockLength);
+
+ sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,occurrences.clone(),subsequence);
+
+ for( byte base: subsequence )
+ occurrences.increment(base);
+ }
+
+ return sequenceBlocks;
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java
new file mode 100644
index 000000000..5c4f6d39d
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTReader.java
@@ -0,0 +1,89 @@
+package org.broadinstitute.sting.alignment.reference.bwt;
+
+import org.broadinstitute.sting.alignment.reference.packing.BasePackedInputStream;
+import org.broadinstitute.sting.alignment.reference.packing.PackUtils;
+import org.broadinstitute.sting.alignment.reference.packing.UnsignedIntPackedInputStream;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.ByteOrder;
+/**
+ * Reads a BWT from a given file.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class BWTReader {
+ /**
+ * Input stream from which to read BWT data.
+ */
+ private FileInputStream inputStream;
+
+ /**
+ * Create a new BWT reader.
+ * @param inputFile File in which the BWT is stored.
+ */
+ public BWTReader( File inputFile ) {
+ try {
+ this.inputStream = new FileInputStream(inputFile);
+ }
+ catch( FileNotFoundException ex ) {
+ throw new ReviewedStingException("Unable to open input file", ex);
+ }
+ }
+
+ /**
+ * Read a BWT from the input stream.
+ * @return The BWT stored in the input stream.
+ */
+ public BWT read() {
+ UnsignedIntPackedInputStream uintPackedInputStream = new UnsignedIntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN);
+ BasePackedInputStream basePackedInputStream = new BasePackedInputStream(Integer.class, inputStream, ByteOrder.LITTLE_ENDIAN);
+
+ long inverseSA0;
+ long[] count;
+ SequenceBlock[] sequenceBlocks;
+
+ try {
+ inverseSA0 = uintPackedInputStream.read();
+ count = new long[PackUtils.ALPHABET_SIZE];
+ uintPackedInputStream.read(count);
+
+ long bwtSize = count[PackUtils.ALPHABET_SIZE-1];
+ sequenceBlocks = new SequenceBlock[PackUtils.numberOfPartitions(bwtSize,BWT.SEQUENCE_BLOCK_SIZE)];
+
+ for( int block = 0; block < sequenceBlocks.length; block++ ) {
+ int sequenceStart = block* BWT.SEQUENCE_BLOCK_SIZE;
+ int sequenceLength = (int)Math.min(BWT.SEQUENCE_BLOCK_SIZE,bwtSize-sequenceStart);
+
+ long[] occurrences = new long[PackUtils.ALPHABET_SIZE];
+ byte[] bwt = new byte[sequenceLength];
+
+ uintPackedInputStream.read(occurrences);
+ basePackedInputStream.read(bwt);
+
+ sequenceBlocks[block] = new SequenceBlock(sequenceStart,sequenceLength,new Counts(occurrences,false),bwt);
+ }
+ }
+ catch( IOException ex ) {
+ throw new ReviewedStingException("Unable to read BWT from input stream.", ex);
+ }
+
+ return new BWT(inverseSA0, new Counts(count,true), sequenceBlocks);
+ }
+
+ /**
+ * Close the input stream.
+ */
+ public void close() {
+ try {
+ inputStream.close();
+ }
+ catch( IOException ex ) {
+ throw new ReviewedStingException("Unable to close input file", ex);
+ }
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java
new file mode 100644
index 000000000..3370f79c8
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/alignment/reference/bwt/BWTSupplementaryFileGenerator.java
@@ -0,0 +1,60 @@
+package org.broadinstitute.sting.alignment.reference.bwt;
+
+import net.sf.picard.reference.ReferenceSequenceFile;
+import net.sf.picard.reference.ReferenceSequenceFileFactory;
+import net.sf.samtools.SAMSequenceDictionary;
+
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * Generate BWA supplementary files (.ann, .amb) from the command line.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class BWTSupplementaryFileGenerator {
+ enum SupplementaryFileType { ANN, AMB }
+
+ public static void main(String[] args) throws IOException {
+ if(args.length < 3)
+ usage("Incorrect number of arguments supplied");
+
+ File fastaFile = new File(args[0]);
+ File outputFile = new File(args[1]);
+ SupplementaryFileType outputType = null;
+ try {
+ outputType = Enum.valueOf(SupplementaryFileType.class,args[2]);
+ }
+ catch(IllegalArgumentException ex) {
+ usage("Invalid output type: " + args[2]);
+ }
+
+ ReferenceSequenceFile sequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(fastaFile);
+ SAMSequenceDictionary dictionary = sequenceFile.getSequenceDictionary();
+
+ switch(outputType) {
+ case ANN:
+ ANNWriter annWriter = new ANNWriter(outputFile);
+ annWriter.write(dictionary);
+ annWriter.close();
+ break;
+ case AMB:
+ AMBWriter ambWriter = new AMBWriter(outputFile);
+ ambWriter.writeEmpty(dictionary);
+ ambWriter.close();
+ break;
+ default:
+ usage("Unsupported output type: " + outputType);
+ }
+ }
+
+ /**
+ * Print usage information and exit.
+ */
+ private static void usage(String message) {
+ System.err.println(message);
+ System.err.println("Usage: BWTSupplementaryFileGenerator